In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# %matplotlib inline

# Display settings: allow long tables and wide text cells to render without
# being truncated by pandas.
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_colwidth', 255)

# Read the two CSV inputs from the local assets directory: the schema file
# and the public survey results.
schema = pd.read_csv('./assets/survey_results_schema.csv')
df = pd.read_csv('./assets/survey_results_public.csv')

In [2]:
from helpers import convert_age_series_to_numeric

# Use the respondent id as the index. set_index returns a new frame rather
# than modifying df, so the result must be assigned back (the original call
# discarded it and the index was never changed). The guard keeps the cell safe
# to re-run once 'Respondent' is no longer a column.
if 'Respondent' in df.columns:
    df = df.set_index('Respondent')

# Exclude CompTotal: the value converted and normalised to annual USD is
# already stored in ConvertedComp.
if 'CompTotal' in df.columns:
    df = df.drop(columns=['CompTotal'])

# Convert these near-numeric columns to numeric values, element by element,
# with the project helper.
# NOTE(review): re-running this cell re-applies the helper to already-converted
# values - confirm the helper is idempotent.
for near_numeric_column in ('YearsCode', 'YearsCodePro', 'Age1stCode'):
    df[near_numeric_column] = df[near_numeric_column].map(convert_age_series_to_numeric)

# Countries kept for the analysis and the columns carried into the model.
selected_countries = ['United States', 'Canada', 'United Kingdom', 'Australia', 'New Zealand']
selected_columns = [
    'Country',
    'Age',
    'Age1stCode',
    'ConvertedComp',
    'Employment',
    'JobSat',
    'JobSeek',
    'MainBranch',
    'NEWEdImpt',
    'OpSys',
    'OrgSize',
    'UndergradMajor',
    'WorkWeekHrs',
    'YearsCode',
    'YearsCodePro',
]

# Keep only rows from the selected countries with no missing value in any of
# the selected columns.
country_subset = df.loc[df['Country'].isin(selected_countries), selected_columns].dropna()

# One-hot encode every object-dtype column except Country, which stays as text
# so later cells can derive indicator columns from it. A single get_dummies
# call replaces the per-column concat loop: the untouched columns come first,
# followed by the '<column>_<level>' dummies in column order, with the first
# level of each column dropped.
categorical_only_columns = country_subset.select_dtypes(include='object').columns
columns_to_encode = [name for name in categorical_only_columns if name != 'Country']
country_subset = pd.get_dummies(
    country_subset,
    columns=columns_to_encode,
    prefix_sep='_',
    drop_first=True,
)

In [3]:
def is_usa(country):
    """Return 1 when `country` is 'United States', otherwise 0."""
    return 1 if country == 'United States' else 0


def is_south_hemi(country):
    """Return 1 when `country` is 'Australia' or 'New Zealand', otherwise 0."""
    return 1 if country in ['Australia', 'New Zealand'] else 0


# Derive the two binary targets from the Country column.
country_subset['is_south_hemi'] = country_subset['Country'].map(is_south_hemi)
country_subset['is_usa'] = country_subset['Country'].map(is_usa)

# Features are every column except the raw Country text and the two derived
# targets, so neither target can leak into the predictors.
X_columns = [elem for elem in country_subset.columns if elem not in ['Country', 'is_usa', 'is_south_hemi']]

# Same data with United Kingdom rows removed, used by the third experiment.
no_uk = country_subset[country_subset['Country'] != 'United Kingdom']

# (target column, frame to fit on, label used in the printed report)
experiments = [
    ('is_usa', country_subset, 'is_usa'),
    ('is_south_hemi', country_subset, 'is_south_hemi'),
    ('is_south_hemi', no_uk, 'is_south_hemi w/ no uk'),
]

# Run the identical split / fit / score procedure for each experiment. This is
# a top-level loop rather than a function so that X, y, the train/test splits,
# lm_model and y_test_preds remain bound to the last experiment afterwards,
# exactly as the original copy-pasted stanzas left them.
for y_column, frame, label in experiments:
    X = frame[X_columns]
    y = frame[y_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

    # The `normalize` argument was deprecated in scikit-learn 1.0 and removed
    # in 1.2, so passing it raises TypeError on current versions. For an
    # unregularised least-squares fit, rescaling the features does not change
    # the predictions (assuming the feature matrix is full rank), so it is
    # simply omitted here.
    lm_model = LinearRegression()  # Instantiate
    lm_model.fit(X_train, y_train)  # Fit
    y_test_preds = lm_model.predict(X_test)  # Predict and score the model
    print("predicting {}: r-squared: {}, test size {}, train size {}, feature size {}.".format(label, round(r2_score(y_test, y_test_preds), 3), len(y_test), len(y_train), X.shape[1]))

#dummy_na = False - no fill_mean
# predicting is_usa: r-squared: 0.086, test size 3123, train size 7287, feature size 44.
# predicting is_south_hemi: r-squared: 0.016, test size 3123, train size 7287, feature size 44.
# predicting is_south_hemi w/ no uk: r-squared: 0.013, test size 2509, train size 5854, feature size 44.


predicting is_usa: r-squared: 0.086, test size 3123, train size 7287, feature size 44.
predicting is_south_hemi: r-squared: 0.016, test size 3123, train size 7287, feature size 44.
predicting is_south_hemi w/ no uk: r-squared: 0.013, test size 2509, train size 5854, feature size 44.
