In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# %matplotlib inline

pd.options.display.max_rows = 2000
pd.options.display.max_colwidth = 255

df = pd.read_csv('./assets/survey_results_public.csv')
schema = pd.read_csv('./assets/survey_results_schema.csv')

In [7]:
# use respondent as the index
if 'Respondent' in df.columns:    
    df.set_index('Respondent')

# Exclude CompTotal as it is pre converted and normalised to annual USD , which is stored in ConvertedComp
if 'CompTotal' in df.columns:
    df.drop(columns=['CompTotal'], inplace=True)
    
# convert these near numerics to a numeric Series
from helpers import convert_age_series_to_numeric
df['YearsCode'] = df['YearsCode'].map(convert_age_series_to_numeric)
df['YearsCodePro'] = df['YearsCodePro'].map(convert_age_series_to_numeric)
df['Age1stCode'] = df['Age1stCode'].map(convert_age_series_to_numeric)

# country_subset = df.query("Country in ['United States', 'Canada', 'Australia', 'United Kingdom', 'New Zealand']")[[
# country_subset = df.query("Country in ['United States', 'Canada']")[[
country_subset = df.query("Country in ['United States', 'Canada', 'Australia', 'New Zealand']")[[
    'Country',
    'Age',
    'Age1stCode',
    'ConvertedComp',
    'Employment',
    'JobSat',
    'JobSeek',
    'MainBranch',
    'NEWEdImpt',
    'OpSys',
    'OrgSize',
    'UndergradMajor',
    'WorkWeekHrs',
    'YearsCode',
    'YearsCodePro'
]].dropna()    

7858

In [12]:
x = df.query("Country in ['United States', 'Canada', 'Australia', 'New Zealand']")[[
    'Country',
    'Age',
    'Age1stCode',
    'ConvertedComp',
    'Employment',
    'JobSat',
    'JobSeek',
    'MainBranch',
    'NEWEdImpt',
    'OpSys',
    'OrgSize',
    'UndergradMajor',
    'WorkWeekHrs',
    'WorkWeekHrs',
    'YearsCode',
    'YearsCodePro'
]]

In [14]:
(x.isna().sum() / x.shape[0]).sort_values()

Country           0.000000
MainBranch        0.002774
Employment        0.004439
Age1stCode        0.056470
YearsCode         0.058443
OpSys             0.083965
UndergradMajor    0.146415
JobSeek           0.146662
NEWEdImpt         0.171321
YearsCodePro      0.209605
JobSat            0.227360
Age               0.229826
OrgSize           0.244005
WorkWeekHrs       0.276493
WorkWeekHrs       0.276493
ConvertedComp     0.354417
dtype: float64

In [12]:
cat_x = x.select_dtypes(include='object')
(cat_x.isna().sum() / cat_x.shape[0]).sort_values()

Country           0.000000
MainBranch        0.002774
Employment        0.004439
OpSys             0.083965
UndergradMajor    0.146415
JobSeek           0.146662
NEWEdImpt         0.171321
JobSat            0.227360
OrgSize           0.244005
dtype: float64

In [13]:
num_x = x.select_dtypes(exclude='object')
(num_x.isna().sum() / num_x.shape[0]).sort_values()

Age1stCode       0.056470
YearsCode        0.058443
YearsCodePro     0.209605
Age              0.229826
WorkWeekHrs      0.276493
WorkWeekHrs      0.276493
ConvertedComp    0.354417
dtype: float64

In [3]:
country_subset.shape

(8363, 16)

In [4]:
is_usa = lambda country: 1 if country == 'United States' else 0
is_south_hemi = lambda country: 1 if country in ['Australia', 'New Zealand'] else 0
#country_subset['is_usa_numeric'] = country_subset['Country'].map(is_usa)
country_subset['is_south_hemi'] = country_subset['Country'].map(is_usa)
country_subset['is_south_hemi'].value_counts()

1    6504
0    1859
Name: is_south_hemi, dtype: int64

In [5]:
categorical_only_columns = country_subset.select_dtypes(include='object').columns
for var in categorical_only_columns:
    if var == 'Country':
        continue
    # for each cat add dummy var, drop original column
    country_subset = pd.concat([country_subset.drop(var, axis=1), pd.get_dummies(country_subset[var], prefix=var, prefix_sep='_', drop_first=True)], axis=1)

In [6]:
# X_columns = [ elem for elem in country_subset.columns if elem not in ['Country', 'is_usa_numeric']]
# y_column = 'is_usa_numeric'

X_columns = [ elem for elem in country_subset.columns if elem not in ['Country', 'is_south_hemi']]
y_column = 'is_south_hemi'



X = country_subset[X_columns]
y = country_subset[y_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 

lm_model = LinearRegression(normalize=True) # Instantiate
lm_model.fit(X_train, y_train) #Fit
        
#Predict and score the model
y_test_preds = lm_model.predict(X_test) 
"The r-squared score for the model was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))


'The r-squared score for the model was 0.045168155286912315 on 2509 values.'