# Linear Regressions
---
Now that the questions have been clustered into groups, we will use each group to fit linear models and see how well it predicts the usage of language.

Each model will include:
- How many years have you been learning/utilizing your learned language?
- Do you speak this learned language at home?
- Is this learned language spoken in your home?

These three variables are included in every model as covariates, to control for possible confounds.

In [15]:
# Import Packages
import pandas as pd
import numpy as np
import json

# Plotting Packages
import matplotlib.pyplot as plt
import seaborn as sns

# Stats Packages (OLS/Linear Regression)
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

# Import Data
df = pd.read_csv('cleaned_data.csv')
# Open the cluster definitions in a context manager so the file handle is
# closed as soon as the JSON has been parsed
with open('motivator_clusters.json') as clusters_file:
    all_motives = json.load(clusters_file)

# Each motive group is the list stored for that cluster in the JSON file,
# extended with the matching rank_* column (the uncategorized group has none)
family_motives        = all_motives['family'] + ['rank_family']
education_motives     = all_motives['education'] + ['rank_education']
media_motives         = all_motives['media'] + ['rank_media']
self_improve_motives  = all_motives['self_improve'] + ['rank_improvement']
culture_motives       = all_motives['culture'] + ['rank_culture']
citizenship_motives   = all_motives['citizenship'] + ['rank_citizenship']
uncategorized_motives = all_motives['uncategorized']

# The rank_* columns on their own
rank_only             = ['rank_family', 'rank_education', 'rank_media', 'rank_improvement', 'rank_culture', 'rank_citizenship']

In [16]:
# Ranks are stored as 1..6 where 1 marks the most important motive.
# Invert the scale (1 <-> 6, 2 <-> 5, 3 <-> 4) so that a larger number means
# a more important motive.
# NOTE: running this cell a second time flips the ranks back again.
df[rank_only] = 7 - df[rank_only]

In [17]:
# Outcome columns: every dataframe column whose name contains 'feel_' ...
feel_cols = list(filter(lambda name: 'feel_' in name, df.columns))
# ... except the two grade columns, which are dropped from the list
for grade_col in ('feel_current_grade', 'feel_expected_grade'):
    feel_cols.remove(grade_col)

# Possible confounds, pre-joined into the right-hand-side fragment of a formula
confounds = ' + '.join(['demo_years_learning', 'demo_home_speaker', 'demo_home_spoken'])

In [18]:
# Drop rows lying more than 3 standard deviations from the mean of any feel_ column.
# NOTE: the filter is applied one column at a time, so each column's mean/std is
# computed on the rows that survived the previous columns (order-dependent).
for feel_col in feel_cols:
    distance_from_mean = (df[feel_col] - df[feel_col].mean()).abs()
    cutoff = 3 * df[feel_col].std()
    df = df[distance_from_mean <= cutoff]

In [20]:
# Right-hand sides of the regression formulas: intercept, the cluster's
# motive columns, then the shared confound terms
family_eq        = f"~ 1 + {' + '.join(family_motives)} + {confounds}"
education_eq     = f"~ 1 + {' + '.join(education_motives)} + {confounds}"
media_eq         = f"~ 1 + {' + '.join(media_motives)} + {confounds}"
self_improve_eq  = f"~ 1 + {' + '.join(self_improve_motives)} + {confounds}"
culture_eq       = f"~ 1 + {' + '.join(culture_motives)} + {confounds}"
citizenship_eq   = f"~ 1 + {' + '.join(citizenship_motives)} + {confounds}"
uncategorized_eq = f"~ 1 + {' + '.join(uncategorized_motives)} + {confounds}"

In [21]:
def computeModel(dependent: list[str], motives: list[str]) -> list:
    '''
    Fits one ols model per dependent variable and keeps those with a significant motive

    Every model regresses the dependent column on an intercept, the given motives
    and the notebook-level `confounds` terms, using the notebook-level `df`.

    Parameters
    ----------
    dependent : list[str]
        The names of the columns to use, one at a time, as the dependent variable
    
    motives : list[str]
        The names of the columns to use as predictors alongside the confounds
    
    Returns
    -------
    list
        The fitted ols models in which at least one motive coefficient has a
        p-value below 0.05, in the order of `dependent`. Models without a
        significant motive are dropped, so positions in this list do not line up
        with positions in `dependent`.
    '''
    # `confounds` is a ' + '-joined string of terms; recover the individual term names
    confound_terms = [term.strip() for term in confounds.split('+')]
    formula = '~ 1 + ' + ' + '.join(motives) + ' + ' + confounds

    models = []
    for col in dependent:
        model = ols((col + formula), data=df).fit()
        # Pick the motive coefficients by name instead of by position: skip the
        # intercept and anything that belongs to a confound term. A categorical
        # term shows up as 'term[...]', so prefix-match on 'term[' as well.
        motive_params = [
            name for name in model.pvalues.index
            if name != 'Intercept'
            and not any(name == term or name.startswith(term + '[') for term in confound_terms)
        ]
        # Check to see if any of the p-values for the motives are less than 0.05
        if (model.pvalues.loc[motive_params] < 0.05).any():
            models.append(model)
    return models

In [22]:
def getR2(models: list) -> list:
    '''
    Collects the R-squared value of each model, preserving order

    Parameters
    ----------
    models : list
        Fitted models, each exposing an `rsquared` attribute
    
    Returns
    -------
    list
        The `rsquared` of every model, in the same order as `models`
    '''
    return [fitted.rsquared for fitted in models]

In [23]:
def getFValues(models: list) -> list:
    '''
    Collects the F-statistic of each model, preserving order

    Parameters
    ----------
    models : list
        Fitted models, each exposing an `fvalue` attribute
    
    Returns
    -------
    list
        The `fvalue` of every model, in the same order as `models`
    '''
    return [fitted.fvalue for fitted in models]

In [24]:
# Fit every feel_ outcome against each motive cluster in turn; each result is
# the list of models computeModel decided to keep for that cluster
(
    family_models,
    education_models,
    media_models,
    self_improve_models,
    culture_models,
    citizenship_models,
    uncategorized_models,
) = [
    computeModel(feel_cols, cluster_motives)
    for cluster_motives in (
        family_motives,
        education_motives,
        media_motives,
        self_improve_motives,
        culture_motives,
        citizenship_motives,
        uncategorized_motives,
    )
]

In [33]:
# computeModel only returns the models that have a significant motive p-value,
# so index 1 is the second retained model, not necessarily the second feel_ column
print(family_models[1].summary())

                               OLS Regression Results                               
Dep. Variable:     feel_continue_structured   R-squared:                       0.241
Model:                                  OLS   Adj. R-squared:                  0.166
Method:                       Least Squares   F-statistic:                     3.183
Date:                      Sun, 11 Jun 2023   Prob (F-statistic):            0.00348
Time:                              16:59:10   Log-Likelihood:                -183.45
No. Observations:                        89   AIC:                             384.9
Df Residuals:                            80   BIC:                             407.3
Df Model:                                 8                                         
Covariance Type:                  nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------

In [26]:
# Index 3 is a position in the filtered list returned by computeModel,
# not a position in feel_cols
print(education_models[3].summary())

                              OLS Regression Results                              
Dep. Variable:     feel_considered_fluent   R-squared:                       0.588
Model:                                OLS   Adj. R-squared:                  0.547
Method:                     Least Squares   F-statistic:                     14.30
Date:                    Sun, 11 Jun 2023   Prob (F-statistic):           9.97e-13
Time:                            16:57:12   Log-Likelihood:                -185.89
No. Observations:                      89   AIC:                             389.8
Df Residuals:                          80   BIC:                             412.2
Df Model:                               8                                         
Covariance Type:                nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------

In [29]:
# Index 6 is a position in the filtered list returned by computeModel,
# not a position in feel_cols
print(media_models[6].summary())

                                OLS Regression Results                                
Dep. Variable:     feel_comfortable_listening   R-squared:                       0.368
Model:                                    OLS   Adj. R-squared:                  0.297
Method:                         Least Squares   F-statistic:                     5.122
Date:                        Sun, 11 Jun 2023   Prob (F-statistic):           1.86e-05
Time:                                16:58:53   Log-Likelihood:                -180.85
No. Observations:                          89   AIC:                             381.7
Df Residuals:                              79   BIC:                             406.6
Df Model:                                   9                                         
Covariance Type:                    nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [163]:
# Index 0 is the first model computeModel retained for this cluster,
# which is not necessarily the model for the first feel_ column
print(self_improve_models[0].summary())

                               OLS Regression Results                               
Dep. Variable:     feel_small_conversations   R-squared:                       0.271
Model:                                  OLS   Adj. R-squared:                  0.211
Method:                       Least Squares   F-statistic:                     4.509
Date:                      Fri, 09 Jun 2023   Prob (F-statistic):           0.000260
Time:                              13:06:05   Log-Likelihood:                -199.92
No. Observations:                        93   AIC:                             415.8
Df Residuals:                            85   BIC:                             436.1
Df Model:                                 7                                         
Covariance Type:                  nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------

In [189]:
# Row-wise sum over the family motive columns ...
total = df[family_motives].sum(axis=1)
# ... divided by the number of family motive columns (displayed as the cell output)
total.div(len(family_motives))

0     7.8
1     4.6
2     8.2
3     8.2
4     9.2
     ... 
88    7.2
89    6.2
90    8.2
91    3.2
92    7.4
Length: 93, dtype: float64

In [212]:
# Composite score per motive cluster: the row-wise sum of the cluster's columns
# divided by the number of columns in that cluster
score_sources = {
    'family_score':        family_motives,
    'education_score':     education_motives,
    'media_score':         media_motives,
    'self_improve_score':  self_improve_motives,
    'culture_score':       culture_motives,
    'citizenship_score':   citizenship_motives,
    'uncategorized_score': uncategorized_motives,
}
for score_name, source_cols in score_sources.items():
    df[score_name] = df[source_cols].sum(axis=1) / len(source_cols)


# One large model per outcome that uses every composite score as a predictor
composite_scores = list(score_sources)

all_models = computeModel(feel_cols, composite_scores)

In [231]:
# Index 2 is a position in the filtered list returned by computeModel,
# not a position in feel_cols
print(all_models[2].summary())

                                 OLS Regression Results                                
Dep. Variable:     feel_adequate_conversations   R-squared:                       0.487
Model:                                     OLS   Adj. R-squared:                  0.425
Method:                          Least Squares   F-statistic:                     7.790
Date:                         Fri, 09 Jun 2023   Prob (F-statistic):           1.19e-08
Time:                                 15:07:12   Log-Likelihood:                -196.75
No. Observations:                           93   AIC:                             415.5
Df Residuals:                               82   BIC:                             443.4
Df Model:                                   10                                         
Covariance Type:                     nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------

In [215]:
# "Crazy" model: every individual motive and rank column from all clusters
# is used as a predictor at once
crazy_models = computeModel(
    feel_cols,
    [
        *family_motives,
        *education_motives,
        *media_motives,
        *self_improve_motives,
        *culture_motives,
        *citizenship_motives,
        *uncategorized_motives,
    ],
)

In [228]:
# Index 10 is a position in the filtered list returned by computeModel,
# not a position in feel_cols
print(crazy_models[10].summary())

                                OLS Regression Results                                
Dep. Variable:     feel_comfortable_listening   R-squared:                       0.627
Model:                                    OLS   Adj. R-squared:                  0.387
Method:                         Least Squares   F-statistic:                     2.613
Date:                        Fri, 09 Jun 2023   Prob (F-statistic):           0.000609
Time:                                13:26:35   Log-Likelihood:                -166.45
No. Observations:                          93   AIC:                             406.9
Df Residuals:                              56   BIC:                             500.6
Df Model:                                  36                                         
Covariance Type:                    nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------