# Linear Regressions
---
Now that the motivator questions have been clustered into groups, we will use those groups to build linear models and see how well they predict the usage of language.

Each model will include:
- How many years have you been learning/utilizing your learned language?
- Do you speak this learned language at home?
- Is this learned language spoken in your home?

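These are included as controlling variables so that their correlation with the outcomes is not mistaken for an effect of the motivators. (The code additionally controls for `demo_num_lang`.)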

In [1]:
# Import Packages
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import json

# Stats Packages
from statsmodels.formula.api import ols
from statsmodels.miscmodels.ordinal_model import OrderedModel

# Import Data
df = pd.read_csv('cleaned_data.csv')

# Read the motivator clusters inside a context manager so the file handle is
# closed as soon as the JSON has been parsed
with open('motivator_clusters.json') as cluster_file:
    all_motives = json.load(cluster_file)

# One list of motivator columns per cluster
family_motives        = all_motives['family']
education_motives     = all_motives['education']
media_motives         = all_motives['media']
self_improve_motives  = all_motives['self_improve']
culture_motives       = all_motives['culture']
citizenship_motives   = all_motives['citizenship']
uncategorized_motives = all_motives['uncategorized']

# The six rank columns
rank_only             = ['rank_family', 'rank_education', 'rank_media', 
                         'rank_improvement', 'rank_culture', 'rank_citizenship']

In [96]:
# All motivator questions, cluster by cluster, flattened into one list
full_cols = [
    motive
    for cluster in (family_motives, education_motives, media_motives, self_improve_motives,
                    culture_motives, citizenship_motives, uncategorized_motives)
    for motive in cluster
]

In [97]:
# Ranks arrive as 1 (most important) through 6 (least important).
# Subtracting from 7 reverses the scale: 1 becomes the least important, 6 the most.
df[rank_only] = 7 - df[rank_only]

In [194]:
# Every column whose name contains 'feel_' ...
feel_cols = [name for name in df.columns if 'feel_' in name]
# ... minus the two grade columns
for grade_name in ('feel_current_grade', 'feel_expected_grade'):
    feel_cols.remove(grade_name)

# Controlling variables, and their ' + '-joined form for use inside formulas
controlling_list = ['demo_num_lang', 'demo_years_learning', 'demo_home_speaker', 'demo_home_spoken']
controlling = ' + '.join(controlling_list)

In [99]:
def computeModel(dependent: list[str], motives: list[str], dataframe=df) -> list:
    '''
    Fits one ols model per dependent variable and keeps those with a significant motive

    Parameters
    ----------
    dependent : list[str]
        Column names that are used, one at a time, as the dependent variable
    
    motives : list[str]
        Column names of the motive predictors. The module-level `controlling`
        terms are always appended to the formula as additional predictors
    
    dataframe : pandas.DataFrame, optional
        The data the models are fitted on. Defaults to the module-level `df`
        as bound when this function is defined
    
    Returns
    -------
    list
        The fitted ols models in which at least one motive term has a
        p-value below 0.05
    '''
    # The right-hand side is the same for every dependent variable, so build it once
    formula = '~ 1 +' + ' + '.join(motives) + ' + ' + controlling
    models = []
    for col in dependent:
        model = ols((col + formula), data=dataframe).fit()
        # Select the motive p-values by term name instead of by position, so the
        # check stays correct when the number of controlling terms changes.
        # Stripping everything from '[' onward also matches the per-level terms
        # that a categorical motive would be expanded into.
        term_names = model.pvalues.index.map(lambda name: name.split('[', 1)[0])
        motive_pvalues = model.pvalues[term_names.isin(motives)]
        # Check to see if any of the p-values for the motives are less than 0.05
        if (motive_pvalues < 0.05).any():
            models.append(model)
    return models

In [100]:
# Fit the per-cluster models: every 'feel_' outcome against one cluster of motives at a time
(family_models,
 education_models,
 media_models,
 self_improve_models,
 culture_models,
 citizenship_models,
 uncategorized_models) = [
    computeModel(feel_cols, cluster_motives)
    for cluster_motives in (family_motives, education_motives, media_motives,
                            self_improve_motives, culture_motives,
                            citizenship_motives, uncategorized_motives)
]

In [101]:
# Composite score per cluster: the row-wise sum of the cluster's motive columns
# divided by the number of columns in that cluster
motive_clusters = {
    'family': family_motives,
    'education': education_motives,
    'media': media_motives,
    'self_improve': self_improve_motives,
    'culture': culture_motives,
    'citizenship': citizenship_motives,
    'uncategorized': uncategorized_motives,
}
for cluster_name, cluster_cols in motive_clusters.items():
    df[cluster_name + '_score'] = df[cluster_cols].sum(axis=1) / len(cluster_cols)


# One large model per outcome that uses all of the composite scores together
composite_scores = [cluster_name + '_score' for cluster_name in motive_clusters]

all_models = computeModel(feel_cols, composite_scores)

In [102]:
# "Full" model: every motive question and every rank act as predictors
full_model = computeModel(feel_cols, full_cols + rank_only)

In [103]:
# Number of 'feel_' outcomes whose full model was kept by computeModel
len(full_model)

11

In [104]:
# Regression table for the model stored at index 10 of the retained full models
full_model[10].summary()

0,1,2,3
Dep. Variable:,feel_comfortable_listening,R-squared:,0.629
Model:,OLS,Adj. R-squared:,0.38
Method:,Least Squares,F-statistic:,2.525
Date:,"Mon, 12 Jun 2023",Prob (F-statistic):,0.000899
Time:,14:27:31,Log-Likelihood:,-166.13
No. Observations:,93,AIC:,408.3
Df Residuals:,55,BIC:,504.5
Df Model:,37,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0478,0.032,1.490,0.142,-0.016,0.112
motivator_family_continue,0.2093,0.142,1.477,0.145,-0.075,0.493
motivator_family_encourage,-0.2071,0.171,-1.208,0.232,-0.551,0.136
motivator_family_stressed,0.0875,0.112,0.779,0.439,-0.138,0.312
motivator_family_important,-0.0756,0.211,-0.358,0.722,-0.499,0.348
motivator_required,0.0360,0.081,0.444,0.659,-0.127,0.199
motivator_choose_not,-0.1441,0.138,-1.047,0.300,-0.420,0.132
motivator_nervous,-0.2875,0.105,-2.751,0.008,-0.497,-0.078
motivator_difficult,-0.0509,0.117,-0.435,0.665,-0.285,0.183

0,1,2,3
Omnibus:,1.47,Durbin-Watson:,1.938
Prob(Omnibus):,0.479,Jarque-Bera (JB):,1.296
Skew:,-0.131,Prob(JB):,0.523
Kurtosis:,2.484,Cond. No.,5.77e+17


In [105]:
# Models that use only the rank columns (plus the controlling variables) as predictors
rank_model = computeModel(feel_cols, rank_only)

### Summary
---
Viewing all the OLS models for each cluster of motivating questions, as well as the ranks, we find that they all explain very little of the variation, as indicated by their $R^2$ values. The model which appears to perform the best is the Full Model, which makes intuitive sense, as more predictors typically account for more variation. However, given this, together with the cluster analysis demonstrating that the Ranks $\neq$ Motivating Questions, it may be more logical to move forward using the entire model.

# Engagement Predictor
---
The next set of regressions examines how students engage with the language course material relative to their motivators.

In [116]:
# Every column whose name contains 'engage_'
engage_cols = list(filter(lambda name: 'engage_' in name, df.columns))

# Separate copy, so changes made to engage_df do not touch df
engage_df = df.copy()

In [117]:
# Numeric survey codes (1-7) and the frequency label each one stands for
engage_dict = {1: "Very Often", 2: "Often", 3: "Sometimes", 4: "Occassionally", 5: "Infrequently", 6: "Rarely", 7: "Never"}

# Ordered dtype that runs from "Never" (lowest) up to "Very Often" (highest)
engage_dtype = CategoricalDtype(categories=list(engage_dict.values())[::-1], ordered=True)

# Swap the codes for their labels, then cast the columns to the ordered categorical dtype
engage_df[engage_cols] = engage_df[engage_cols].replace(engage_dict).astype(engage_dtype)

In [128]:
# Check which of the ordered levels actually occur in one engagement column
engage_df['engage_attend_class'].unique()

['Very Often', 'Often', 'Never', 'Sometimes']
Categories (7, object): ['Never' < 'Rarely' < 'Infrequently' < 'Occassionally' < 'Sometimes' < 'Often' < 'Very Often']

In [143]:
# Show the names of the engagement columns
engage_cols

['engage_attend_class',
 'engage_participate_class',
 'engage_apps',
 'engage_practice_others',
 'engage_listen',
 'engage_read',
 'engage_watch']

In [164]:
# Intercept-free formula: the fourth engagement column regressed on the controlling
# variables, every motive question, and every rank
ordered_formula = ' + '.join([engage_cols[3] + ' ~ -1', controlling, *full_cols, *rank_only])

In [165]:
# Show the assembled formula string
ordered_formula

'engage_practice_others ~ -1 + demo_num_lang + demo_years_learning + demo_home_speaker + demo_home_spoken + motivator_family_continue + motivator_family_encourage + motivator_family_stressed + motivator_family_important + motivator_required + motivator_choose_not + motivator_nervous + motivator_difficult + motivator_newspapers + motivator_tv + motivator_understand + motivator_speak + motivator_many + motivator_practical + motivator_career + motivator_job + motivator_educated + motivator_converse + motivator_understand_cultural + motivator_interact + motivator_great + motivator_communicate + motivator_natural + motivator_fluent + motivator_aspects + motivator_choose + motivator_friends + motivator_read + rank_family + rank_education + rank_media + rank_improvement + rank_culture + rank_citizenship'

In [168]:
# Fit the ordered model described by ordered_formula with the BFGS optimizer,
# then display its summary table
model = (
    OrderedModel
    .from_formula(ordered_formula, engage_df, hasconst=False)
    .fit(method='bfgs')
)
model.summary()

Optimization terminated successfully.
         Current function value: 1.257584
         Iterations: 52
         Function evaluations: 58
         Gradient evaluations: 58


0,1,2,3
Dep. Variable:,engage_practice_others,Log-Likelihood:,-116.96
Model:,OrderedModel,AIC:,321.9
Method:,Maximum Likelihood,BIC:,433.3
Date:,"Mon, 12 Jun 2023",,
Time:,14:41:32,,
No. Observations:,93,,
Df Residuals:,49,,
Df Model:,44,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
demo_num_lang,-0.4986,0.220,-2.262,0.024,-0.931,-0.067
demo_years_learning,0.0004,0.070,0.006,0.995,-0.136,0.137
demo_home_speaker,0.2569,0.583,0.441,0.660,-0.886,1.400
demo_home_spoken,-0.5812,0.597,-0.974,0.330,-1.751,0.589
motivator_family_continue,-0.0557,0.082,-0.683,0.495,-0.216,0.104
motivator_family_encourage,-0.2335,0.103,-2.263,0.024,-0.436,-0.031
motivator_family_stressed,0.0402,0.067,0.603,0.547,-0.091,0.171
motivator_family_important,0.1564,0.123,1.267,0.205,-0.085,0.398
motivator_required,0.0032,0.047,0.067,0.946,-0.089,0.095


In [190]:
# Create an ordered (logit) model for every engagement column, with the full
# predictor set (all motives and ranks) plus the controlling variables as the independent
engage_models = []

# The right-hand side is identical for every outcome, so build it once
engage_predictors = full_cols + rank_only
engage_rhs = ' ~ 1 +' + ' + '.join(engage_predictors) + ' + ' + controlling

for col in engage_cols:
    formula = col + engage_rhs
    model = OrderedModel.from_formula(formula, engage_df, hasconst=False, distr='logit').fit(method='bfgs')
    # Skip fits whose optimizer did not report convergence: their p-values are
    # not meaningful. If no convergence flag is available the fit is kept.
    fit_info = getattr(model, 'mle_retvals', None) or {}
    if not fit_info.get('converged', True):
        continue
    # Select the motive/rank p-values by term name instead of by position, so the
    # check does not depend on how many controlling terms and thresholds follow them
    term_names = model.pvalues.index.map(lambda name: name.split('[', 1)[0])
    predictor_pvalues = model.pvalues[term_names.isin(engage_predictors)]
    # Check to see if any of the p-values for the motives or ranks are less than 0.05
    if (predictor_pvalues < 0.05).any():
        engage_models.append(model)

  np.log(np.diff(params[:-1]))))
  grad[k, :] = (f(*((x+ei,)+args), **kwargs) -
  hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs)
  - (f(*((x - ee[i, :] + ee[j, :],) + args), **kwargs)


         Current function value: 0.500285
         Iterations: 0
         Function evaluations: 1
         Gradient evaluations: 1


  - f(*((x + ee[i, :] - ee[j, :],) + args), **kwargs)
  np.log(np.diff(params[:-1]))))
  grad[k, :] = (f(*((x+ei,)+args), **kwargs) -
  hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs)
  - (f(*((x - ee[i, :] + ee[j, :],) + args), **kwargs)


         Current function value: 1.118372
         Iterations: 0
         Function evaluations: 1
         Gradient evaluations: 1


  - f(*((x + ee[i, :] - ee[j, :],) + args), **kwargs)


Optimization terminated successfully.
         Current function value: 1.471434
         Iterations: 55
         Function evaluations: 59
         Gradient evaluations: 59
Optimization terminated successfully.
         Current function value: 1.257521
         Iterations: 58
         Function evaluations: 63
         Gradient evaluations: 63




Optimization terminated successfully.
         Current function value: 1.488765
         Iterations: 58
         Function evaluations: 63
         Gradient evaluations: 63
Optimization terminated successfully.
         Current function value: 1.634311
         Iterations: 50
         Function evaluations: 55
         Gradient evaluations: 55




Optimization terminated successfully.
         Current function value: 1.439855
         Iterations: 52
         Function evaluations: 56
         Gradient evaluations: 56


In [191]:
# Number of engagement models that were kept
len(engage_models)

3

In [193]:
# Summary table of the second retained engagement model
engage_models[1].summary()

0,1,2,3
Dep. Variable:,engage_listen,Log-Likelihood:,-138.46
Model:,OrderedModel,AIC:,364.9
Method:,Maximum Likelihood,BIC:,476.3
Date:,"Mon, 12 Jun 2023",,
Time:,14:54:26,,
No. Observations:,93,,
Df Residuals:,49,,
Df Model:,44,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
motivator_family_continue,-0.1711,0.138,-1.239,0.215,-0.442,0.100
motivator_family_encourage,-0.3705,0.172,-2.157,0.031,-0.707,-0.034
motivator_family_stressed,0.1949,0.117,1.669,0.095,-0.034,0.424
motivator_family_important,0.0703,0.205,0.342,0.732,-0.332,0.473
motivator_required,0.0470,0.082,0.571,0.568,-0.114,0.208
motivator_choose_not,-0.1454,0.137,-1.058,0.290,-0.415,0.124
motivator_nervous,0.1397,0.107,1.303,0.193,-0.070,0.350
motivator_difficult,-0.0091,0.122,-0.075,0.940,-0.248,0.230
motivator_newspapers,-0.2574,0.127,-2.030,0.042,-0.506,-0.009


### Summary
---
There appears to be no significant relationship between each motivator and how students engage with material.

# Final OLS
---
For the final series of linear regressions, we'll see how motivators affect how people use their languages.

In [15]:
# Every column whose name contains the literal text 'use_'
use_cols = df.columns[df.columns.str.contains('use_', regex=False)].tolist()

In [16]:
# Start from a copy of df; the outlier rows are dropped from this copy only
temp_df = df.copy()

# Go through the usage columns one at a time and keep only the rows that lie within
# three standard deviations of that column's mean (both taken over the rows still remaining)
for use_col in use_cols:
    distance_from_mean = (temp_df[use_col] - temp_df[use_col].mean()).abs()
    temp_df = temp_df[distance_from_mean <= 3 * temp_df[use_col].std()]

In [17]:
# Perform an OLS on each of the use_cols with the full model (all motives and ranks,
# plus the controlling variables) as the independent, fitted on the outlier-free temp_df
use_models = computeModel(use_cols, full_cols + rank_only, dataframe=temp_df)

In [18]:
# Number of usage models that were kept
len(use_models)

5

In [19]:
# Summary table of the fifth retained usage model
use_models[4].summary()

0,1,2,3
Dep. Variable:,use_duolingo_usage,R-squared:,0.544
Model:,OLS,Adj. R-squared:,0.16
Method:,Least Squares,F-statistic:,1.419
Date:,"Mon, 12 Jun 2023",Prob (F-statistic):,0.133
Time:,14:06:27,Log-Likelihood:,-260.99
No. Observations:,82,AIC:,598.0
Df Residuals:,44,BIC:,689.4
Df Model:,37,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0428,0.152,0.281,0.780,-0.264,0.350
motivator_family_continue,-0.3759,0.749,-0.502,0.618,-1.885,1.133
motivator_family_encourage,0.4956,0.841,0.589,0.559,-1.200,2.191
motivator_family_stressed,-0.7902,0.550,-1.436,0.158,-1.899,0.319
motivator_family_important,1.0448,1.114,0.938,0.354,-1.201,3.290
motivator_required,-0.1321,0.374,-0.353,0.726,-0.886,0.622
motivator_choose_not,-0.4103,0.638,-0.643,0.523,-1.696,0.875
motivator_nervous,0.5926,0.501,1.182,0.244,-0.418,1.603
motivator_difficult,0.0340,0.571,0.060,0.953,-1.116,1.184

0,1,2,3
Omnibus:,4.812,Durbin-Watson:,2.047
Prob(Omnibus):,0.09,Jarque-Bera (JB):,4.072
Skew:,0.514,Prob(JB):,0.131
Kurtosis:,3.369,Cond. No.,2.9e+17


In [2]:
# Read in the decode_dict.json file; the context manager closes the file
# handle once the JSON has been parsed
with open('decode_dict.json') as decode_file:
    decode_dict = json.load(decode_file)

In [14]:
# Look up the decoded text stored for one motivator column
decode_dict['motivator_many']

'I would really like to learn many foreign languages.'

# Last Ordinal Logistic Regression
--- 
We will conduct one last analysis which investigates how motivators affect student perception of their current and expected performance within a class.

In [201]:
# The two self-reported grade columns
grade_cols = ['feel_current_grade', 'feel_expected_grade']

# Keep only the rows in which both grades are one of A, B, C, D, or F
letter_grades = ['A', 'B', 'C', 'D', 'F']
grade_df = df.copy()
for grade_col in grade_cols:
    grade_df = grade_df[grade_df[grade_col].isin(letter_grades)]

In [203]:
# How often each current grade occurs after the filtering
grade_df['feel_current_grade'].value_counts()

feel_current_grade
A    54
B    28
C     2
Name: count, dtype: int64

In [204]:
# How often each expected grade occurs after the filtering
grade_df['feel_expected_grade'].value_counts()

feel_expected_grade
A    54
B    24
C     6
Name: count, dtype: int64

In [209]:
# Turn both grade columns into ordered categoricals with the ordering A < B < C
for grade_col in grade_cols:
    grade_df[grade_col] = pd.Categorical(grade_df[grade_col], categories=['A', 'B', 'C'], ordered=True)

In [216]:
# Construct an ordered model for the current grade, using every motive, rank,
# and controlling column as predictors
# NOTE(review): no `distr` is passed here, while the engagement loop passes
# distr='logit'; statsmodels documents 'probit' as the default, so confirm
# which link this "logistic" analysis is meant to use.
# NOTE(review): the grade categories are ordered 'A' < 'B' < 'C', so higher
# levels of the outcome are lower letter grades; keep that in mind when
# reading the coefficient signs.
current_grade_model = OrderedModel(grade_df['feel_current_grade'], 
                                   grade_df[full_cols + rank_only + controlling_list],
                                   hasconst=False).fit(method='bfgs')
current_grade_model.summary()

Optimization terminated successfully.
         Current function value: 0.258486
         Iterations: 104
         Function evaluations: 107
         Gradient evaluations: 107


0,1,2,3
Dep. Variable:,feel_current_grade,Log-Likelihood:,-21.713
Model:,OrderedModel,AIC:,123.4
Method:,Maximum Likelihood,BIC:,220.7
Date:,"Mon, 12 Jun 2023",,
Time:,15:01:40,,
No. Observations:,84,,
Df Residuals:,44,,
Df Model:,40,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
motivator_family_continue,0.3639,0.324,1.122,0.262,-0.272,1.000
motivator_family_encourage,-1.3226,0.745,-1.776,0.076,-2.782,0.137
motivator_family_stressed,-0.3700,0.320,-1.158,0.247,-0.996,0.256
motivator_family_important,0.7969,0.499,1.597,0.110,-0.181,1.775
motivator_required,0.9357,0.581,1.609,0.108,-0.204,2.075
motivator_choose_not,-1.3144,0.800,-1.643,0.100,-2.882,0.253
motivator_nervous,0.5947,0.419,1.421,0.155,-0.226,1.415
motivator_difficult,1.5374,0.911,1.688,0.091,-0.248,3.323
motivator_newspapers,-0.8395,0.465,-1.807,0.071,-1.750,0.071


In [215]:
# Construct an ordered model for the expected grade, using every motive, rank,
# and controlling column as predictors
expected_grade_model = OrderedModel(grade_df['feel_expected_grade'], 
                                   grade_df[full_cols + rank_only + controlling_list],
                                   hasconst=False).fit(method='bfgs')
expected_grade_model.summary()

Optimization terminated successfully.
         Current function value: 0.407930
         Iterations: 60
         Function evaluations: 63
         Gradient evaluations: 63


0,1,2,3
Dep. Variable:,feel_expected_grade,Log-Likelihood:,-34.266
Model:,OrderedModel,AIC:,148.5
Method:,Maximum Likelihood,BIC:,245.8
Date:,"Mon, 12 Jun 2023",,
Time:,14:59:06,,
No. Observations:,84,,
Df Residuals:,44,,
Df Model:,40,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
motivator_family_continue,0.1427,0.141,1.013,0.311,-0.134,0.419
motivator_family_encourage,-0.1068,0.203,-0.527,0.598,-0.504,0.290
motivator_family_stressed,0.0806,0.124,0.649,0.516,-0.163,0.324
motivator_family_important,-0.0340,0.227,-0.150,0.881,-0.479,0.411
motivator_required,0.0620,0.096,0.646,0.518,-0.126,0.250
motivator_choose_not,-0.4899,0.218,-2.246,0.025,-0.917,-0.062
motivator_nervous,-0.0665,0.137,-0.484,0.628,-0.336,0.203
motivator_difficult,0.5855,0.160,3.658,0.000,0.272,0.899
motivator_newspapers,-0.2476,0.145,-1.711,0.087,-0.531,0.036
