In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# %matplotlib inline

# Notebook display limits: show up to 2000 rows and up to 255 characters per
# cell when a frame is rendered.
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_colwidth', 255)

# Raw survey responses and the accompanying schema file.
df = pd.read_csv('./assets/survey_results_public.csv')
schema = pd.read_csv('./assets/survey_results_schema.csv')

# Experiment switches read by the later cells:
#   DUMMY_NA - passed to pd.get_dummies as dummy_na (adds a NaN indicator column).
#   FILL     - "Mean" selects mean imputation; any other value selects mode imputation.
DUMMY_NA = True
FILL = "Mode"


In [2]:
# Use respondent as the index. DataFrame.set_index returns a new frame rather
# than modifying `df`, so the result must be assigned back. The column check
# keeps the cell re-runnable once 'Respondent' has moved into the index.
if 'Respondent' in df.columns:
    df = df.set_index('Respondent')

# Exclude CompTotal as it is pre converted and normalised to annual USD, which is stored in ConvertedComp
if 'CompTotal' in df.columns:
    df = df.drop(columns=['CompTotal'])

# Convert these near-numeric columns to numeric values. The helper is a
# project-local function applied element-wise through Series.map; its exact
# conversion rules are not visible in this notebook.
from helpers import convert_age_series_to_numeric
for near_numeric in ('YearsCode', 'YearsCodePro', 'Age1stCode'):
    df[near_numeric] = df[near_numeric].map(convert_age_series_to_numeric)

# Keep respondents from the five listed countries and only the columns used
# as the country label or as model features.
country_subset = df.query("Country in ['United States', 'Canada', 'United Kingdom', 'Australia', 'New Zealand']")[[
    'Country',
    'Age',
    'Age1stCode',
    'ConvertedComp',
    'Employment',
    'JobSat',
    'JobSeek',
    'MainBranch',
    'NEWEdImpt',
    'OpSys',
    'OrgSize',
    'UndergradMajor',
    'WorkWeekHrs',
    'YearsCode',
    'YearsCodePro'
]]

# One-hot encode every object-typed column except 'Country', which is kept as
# text because the prediction targets are derived from it later. A single
# get_dummies call drops each original column and appends its dummy columns
# (prefixed with the column name) in the same order as encoding them one by one.
categorical_only_columns = country_subset.select_dtypes(include='object').columns
dummy_columns = [var for var in categorical_only_columns if var != 'Country']
country_subset = pd.get_dummies(
    country_subset,
    columns=dummy_columns,
    prefix_sep='_',
    drop_first=True,
    dummy_na=DUMMY_NA,
)

In [3]:
def fill_mean(col):
    """Return ``col`` with missing values replaced by the column mean."""
    return col.fillna(col.mean())


def fill_mode(col):
    """Return ``col`` with missing values replaced by the first modal value."""
    return col.fillna(col.mode()[0])


# FILL == "Mean" selects mean imputation; every other value falls back to mode.
fill = fill_mean if FILL == "Mean" else fill_mode

# Impute each numeric column once; filling an already-filled column again
# would leave it unchanged.
for numeric_column in ['YearsCodePro', 'Age', 'Age1stCode', 'ConvertedComp', 'WorkWeekHrs', 'YearsCode']:
    country_subset[numeric_column] = fill(country_subset[numeric_column])


In [4]:
def is_usa(country):
    """Return 1 when ``country`` is 'United States', otherwise 0."""
    if country == 'United States':
        return 1
    return 0


def is_south_hemi(country):
    """Return 1 when ``country`` is 'Australia' or 'New Zealand', otherwise 0."""
    if country in ['Australia', 'New Zealand']:
        return 1
    return 0


# Binary target columns derived from the country label.
country_labels = country_subset['Country']
country_subset['is_south_hemi'] = country_labels.map(is_south_hemi)
country_subset['is_usa'] = country_labels.map(is_usa)

def count_ones_ratio(arr):
    """Return the fraction of elements in ``arr`` that are equal to 1.

    Parameters
    ----------
    arr : array-like
        Values to inspect, e.g. a NumPy array of predicted class labels, a
        pandas Series, or a plain list. Multi-dimensional input is flattened.

    Returns
    -------
    float
        Number of elements equal to 1 divided by the total number of
        elements. Returns 0.0 for empty input instead of dividing by zero.
    """
    # np.asarray lets plain lists through as well as arrays and Series.
    values = np.asarray(arr).ravel()
    if values.size == 0:
        return 0.0
    return int(np.count_nonzero(values == 1)) / values.size


# Three experiments share one recipe: split 70/30, fit a logistic-regression
# classifier, then report metrics on the held-out rows. Only the target column
# and the row subset differ, so they are described as data and run in a loop.
#
# Reported values:
#   r-squared    - r2_score computed on the hard 0/1 class predictions.
#   score        - LR.score on the test split.
#   is_one_ratio - share of test predictions equal to 1. The recorded runs
#                  below show 1.0 or 0.0, i.e. a single class was predicted
#                  for every test row.
X_columns = [elem for elem in country_subset.columns if elem not in ['Country', 'is_usa', 'is_south_hemi']]
no_uk = country_subset[country_subset['Country'] != 'United Kingdom']

experiments = [
    # (label printed in front of the metrics, target column, rows to use)
    ("predicting is_usa using LC", 'is_usa', country_subset),
    ("predicting is_south_hemi using LC", 'is_south_hemi', country_subset),
    ("predicting is_south_hemi w/ no uk using LC", 'is_south_hemi', no_uk),
]

for label, y_column, frame in experiments:
    X = frame[X_columns]
    y = frame[y_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)
    # multi_class is not passed: the targets are binary (0/1), where 'ovr' and
    # the default behave the same, and recent scikit-learn releases deprecate
    # the argument.
    LR = LogisticRegression(random_state=42, solver='lbfgs')
    LR.fit(X_train, y_train)
    score = LR.score(X_test, y_test)
    y_test_preds = LR.predict(X_test)
    print("{}: r-squared: {}, score: {}, is_one_ratio: {}, test size {}, train size {}, feature size {}.".format(
        label,
        round(r2_score(y_test, y_test_preds), 3),
        round(score, 3),
        count_ones_ratio(y_test_preds),
        len(y_test),
        len(y_train),
        X.shape[1],
    ))

# #dummy_na=False fill=mean
# predicting is_usa using LC: r-squared: -0.629, score: 0.614, is_one_ratio: 1.0, test size 6036, train size 14081, feature size 48.
# predicting is_south_hemi using LC: r-squared: -0.087, score: 0.92, is_one_ratio: 0.0, test size 6036, train size 14081, feature size 48.
# predicting is_south_hemi w/ no uk using LC: r-squared: -0.108, score: 0.902, is_one_ratio: 0.0, test size 4867, train size 11354, feature size 48.

# #dummy_na=True fill=mode
# predicting is_usa using LC: r-squared: -0.629, score: 0.614, is_one_ratio: 1.0, test size 6036, train size 14081, feature size 56.
# predicting is_south_hemi using LC: r-squared: -0.087, score: 0.92, is_one_ratio: 0.0, test size 6036, train size 14081, feature size 56.
# predicting is_south_hemi w/ no uk using LC: r-squared: -0.108, score: 0.902, is_one_ratio: 0.0, test size 4867, train size 11354, feature size 56.


predicting is_usa using LC: r-squared: -0.629, score: 0.614, is_one_ratio: 1.0, test size 6036, train size 14081, feature size 56.
predicting is_south_hemi using LC: r-squared: -0.087, score: 0.92, is_one_ratio: 0.0, test size 6036, train size 14081, feature size 56.
predicting is_south_hemi w/ no uk using LC: r-squared: -0.108, score: 0.902, is_one_ratio: 0.0, test size 4867, train size 11354, feature size 56.


In [5]:
# try to filter by employment status

# Step 1: respondents from the five listed countries.
in_target_countries = df.query("Country in ['United States', 'Canada', 'United Kingdom', 'Australia', 'New Zealand']")

# Step 2: of those, respondents whose Employment is one of the three listed statuses.
currently_employed = in_target_countries.query("Employment in ['Employed full-time', 'Employed part-time', 'Independent contractor, freelancer, or self-employed']")

# Step 3: keep the country label plus the feature columns. 'Employment' is not
# selected here, since it was only used as a row filter.
employed_feature_columns = [
    'Country',
    'Age',
    'Age1stCode',
    'ConvertedComp',
    'JobSat',
    'JobSeek',
    'MainBranch',
    'NEWEdImpt',
    'OpSys',
    'OrgSize',
    'UndergradMajor',
    'WorkWeekHrs',
    'YearsCode',
    'YearsCodePro',
]
employed_country_subset = currently_employed[employed_feature_columns]

In [6]:
# One-hot encode every object-typed column except 'Country'. A single
# get_dummies call drops each encoded column and appends its dummies (prefixed
# with the column name), matching what encoding the columns one by one yields.
categorical_only_columns = employed_country_subset.select_dtypes(include='object').columns
employed_dummy_columns = [name for name in categorical_only_columns if name != 'Country']
employed_country_subset = pd.get_dummies(
    employed_country_subset,
    columns=employed_dummy_columns,
    prefix_sep='_',
    drop_first=True,
    dummy_na=DUMMY_NA,
)


def fill_mean(col):
    """Return ``col`` with missing values replaced by the column mean."""
    return col.fillna(col.mean())


def fill_mode(col):
    """Return ``col`` with missing values replaced by the first modal value."""
    return col.fillna(col.mode()[0])


# FILL == "Mean" selects mean imputation; every other value falls back to mode.
fill = fill_mean if FILL == "Mean" else fill_mode

# Impute each numeric column once; a second pass over a filled column is a no-op.
for numeric_column in ['YearsCodePro', 'Age', 'Age1stCode', 'ConvertedComp', 'WorkWeekHrs', 'YearsCode']:
    employed_country_subset[numeric_column] = fill(employed_country_subset[numeric_column])

In [7]:
# Binary target columns derived from the country label. is_usa, is_south_hemi
# and count_ones_ratio are the helpers defined in the earlier country_subset
# modelling cell, so they are reused here rather than defined a second time.
employed_country_subset['is_south_hemi'] = employed_country_subset['Country'].map(is_south_hemi)
employed_country_subset['is_usa'] = employed_country_subset['Country'].map(is_usa)

# Three experiments share one recipe: split 70/30, fit a logistic-regression
# classifier, then report metrics on the held-out rows. Only the target column
# and the row subset differ, so they are described as data and run in a loop.
#
# Reported values:
#   r-squared    - r2_score computed on the hard 0/1 class predictions.
#   score        - LR.score on the test split.
#   is_one_ratio - share of test predictions equal to 1. The recorded runs
#                  below show 1.0 or 0.0, i.e. a single class was predicted
#                  for every test row.
X_columns = [elem for elem in employed_country_subset.columns if elem not in ['Country', 'is_usa', 'is_south_hemi']]
no_uk = employed_country_subset[employed_country_subset['Country'] != 'United Kingdom']

experiments = [
    # (label printed in front of the metrics, target column, rows to use)
    ("predicting is_usa using LC", 'is_usa', employed_country_subset),
    ("predicting is_south_hemi using LC", 'is_south_hemi', employed_country_subset),
    ("predicting is_south_hemi w/ no uk using LC", 'is_south_hemi', no_uk),
]

for label, y_column, frame in experiments:
    X = frame[X_columns]
    y = frame[y_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)
    # multi_class is not passed: the targets are binary (0/1), where 'ovr' and
    # the default behave the same, and recent scikit-learn releases deprecate
    # the argument.
    LR = LogisticRegression(random_state=42, solver='lbfgs')
    LR.fit(X_train, y_train)
    score = LR.score(X_test, y_test)
    y_test_preds = LR.predict(X_test)
    print("{}: r-squared: {}, score: {}, is_one_ratio: {}, test size {}, train size {}, feature size {}.".format(
        label,
        round(r2_score(y_test, y_test_preds), 3),
        round(score, 3),
        count_ones_ratio(y_test_preds),
        len(y_test),
        len(y_train),
        X.shape[1],
    ))

#dummy_na=False fill=mean
# predicting is_usa using LC: r-squared: -0.632, score: 0.613, is_one_ratio: 1.0, test size 5220, train size 12177, feature size 42.
# predicting is_south_hemi using LC: r-squared: -0.081, score: 0.925, is_one_ratio: 0.0, test size 5220, train size 12177, feature size 42.
# predicting is_south_hemi w/ no uk using LC: r-squared: -0.102, score: 0.907, is_one_ratio: 0.0, test size 4185, train size 9763, feature size 42.

#dummy_na=True fill=mode
# predicting is_usa using LC: r-squared: -0.632, score: 0.613, is_one_ratio: 1.0, test size 5220, train size 12177, feature size 49.
# predicting is_south_hemi using LC: r-squared: -0.081, score: 0.925, is_one_ratio: 0.0, test size 5220, train size 12177, feature size 49.
# predicting is_south_hemi w/ no uk using LC: r-squared: -0.102, score: 0.907, is_one_ratio: 0.0, test size 4185, train size 9763, feature size 49.

predicting is_usa using LC: r-squared: -0.632, score: 0.613, is_one_ratio: 1.0, test size 5220, train size 12177, feature size 49.
predicting is_south_hemi using LC: r-squared: -0.081, score: 0.925, is_one_ratio: 0.0, test size 5220, train size 12177, feature size 49.
predicting is_south_hemi w/ no uk using LC: r-squared: -0.102, score: 0.907, is_one_ratio: 0.0, test size 4185, train size 9763, feature size 49.
