In [4]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import copy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, cross_val_score

%matplotlib inline

In [5]:
# Display / plotting configuration
from IPython.display import display

# let wide frames render up to 150 columns before pandas truncates them
pd.options.display.max_columns = 150
plt.style.use('bmh')

In [6]:
# Load the survey: `columns` contains the data headers and `young` contains
# the actual responses.
#
# The data directory is defined once and can be overridden through the
# YOUNG_SURVEY_DIR environment variable, so the notebook is not tied to a
# single machine. The default is the folder the notebook was written against.
from os import environ
from pathlib import Path

DATA_DIR = Path(environ.get(
    'YOUNG_SURVEY_DIR',
    r'C:\Users\jmpl\Documents\Data_Science\Kaggle\young-people-survey',
))

columns = pd.read_csv(DATA_DIR / 'columns.csv')
young = pd.read_csv(DATA_DIR / 'responses.csv')

FileNotFoundError: [Errno 2] File b'C:\\Users\\jmpl\\Documents\\Data_Science\\Kaggle\\young-people-survey\\columns.csv' does not exist: b'C:\\Users\\jmpl\\Documents\\Data_Science\\Kaggle\\young-people-survey\\columns.csv'

In [None]:
# Summary statistics of the survey responses, as computed by DataFrame.describe()
young.describe()

In [None]:
# Mean and standard deviation of the respondents' age, one value per line
print(young['Age'].mean(), young['Age'].std(), sep='\n')

In [None]:
# Number of respondents per age.
# The column is passed by keyword: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, not `x`, so a positional Series is no longer
# interpreted as the variable to count.
sns.countplot(x='Age', data=young)

In [None]:
# Goal: split the respondents into a younger group (< 20) and an older group
# (>= 20) and look for traits that distinguish the two generations.
# Expected differences: social behaviour (partying, socialising), smoking and
# drinking, number of friends, mood changes, spending habits and the level of
# education received.

# number of rows in the survey
print(young['Gender'].shape[0])

In [None]:
# Label every respondent as 'Younger' (< 20) or 'Older' (20 or over).
# A missing age compares False against 20, so np.where alone would silently
# label it 'Older'. The .where(age_known) mask keeps such rows missing, which
# lets a dropna on 'Under 20' actually remove them.
age_known = young['Age'].notna()
young['Under 20'] = pd.Series(
    np.where(young['Age'] < 20, 'Younger', 'Older'), index=young.index
).where(age_known)

print(young['Under 20'].head())

# group sizes (rows with a missing age belong to neither group)
num_under20 = int((young['Under 20'] == 'Younger').sum())
print(num_under20)

num_over20 = int((young['Under 20'] == 'Older').sum())
print(num_over20)

In [None]:
# Target column and its numeric encoding, in the nested {column: {old: new}}
# form that DataFrame.replace accepts
var_of_interest = 'Under 20'
mapping = {var_of_interest: dict(Younger=0, Older=1)}

# discard respondents without a target label
young.dropna(subset=[var_of_interest], inplace=True)

# constant helper column: a single category to plot against, so that seaborn's
# `hue` parameter can be used for side-by-side comparison
young["all"] = ""



In [None]:
# Left panel: size of each age group. Right panel: the same counts split by gender.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
for axis, hue in zip(ax, (None, 'Gender')):
    sns.countplot(x=var_of_interest, hue=hue, data=young, ax=axis)


In [None]:
# Compare the age distribution of the two genders: the constant 'all' column
# puts every respondent in one category, and `split=True` draws the genders as
# the two halves of a single violin.

data = young
violin_opts = dict(x='Age', y='all', hue='Gender', split=True)
sns.violinplot(data=data, **violin_opts)

In [None]:
def do_ploting(x, y, figsize):
    """Draw a bar chart of correlation coefficients on a new figure.

    Parameters
    ----------
    x : sequence
        Forwarded to ``sns.barplot`` as ``x``. ``correlation_plot`` passes the
        correlation coefficients here.
    y : sequence
        Forwarded to ``sns.barplot`` as ``y``. ``correlation_plot`` passes the
        feature names here.
    figsize : tuple
        Figure size handed to ``plt.subplots``.
    """
    _, ax = plt.subplots(figsize=figsize)
    ax.set_title("Correlation coefficient of the variables")
    sns.barplot(x=x, y=y, ax=ax)
    # The coefficients are supplied as `x`, so that is the axis that must carry
    # this label; previously it was written onto the y axis (the feature names).
    ax.set_xlabel("Correlation coefficients")


def correlation_plot(var_of_interest, df_main, mapping, figsize=(10, 30)):
    """Correlate the columns of ``df_main`` with ``var_of_interest`` and plot them.

    A copy of ``df_main`` is recoded with ``mapping`` and its missing numeric
    values are filled with the column means. Non-object columns are correlated
    with the target directly; object columns are one-hot encoded first. Each
    group is drawn with ``do_ploting``.

    Parameters
    ----------
    var_of_interest : str
        Name of the target column.
    df_main : pandas.DataFrame
        Input data; it is not modified.
    mapping : dict
        Passed to ``DataFrame.replace``; expected to turn the target column
        into numbers.
    figsize : tuple, default (10, 30)
        Figure size for the plot of the non-object columns (the plot of the
        one-hot encoded columns always uses ``(5, 10)``).

    Returns
    -------
    list
        ``[corrs_one, corrs_two]``. Each is a DataFrame with the columns
        ``features`` and ``corr_values`` sorted by ``corr_values``;
        ``corrs_two`` is ``0`` when there are no object columns.

    Raises
    ------
    ValueError
        If the target column cannot be converted to numbers after the mapping
        has been applied.
    """
    def calc_corr(var_of_interest, df, cols, figsize):
        """Correlate each column in ``cols`` with the target, plot and return the table."""
        target = df[var_of_interest]
        corrs = pd.DataFrame({
            'features': list(cols),
            'corr_values': [np.corrcoef(df[col], target)[0, 1] for col in cols],
        })
        corrs = corrs.sort_values(by='corr_values')
        do_ploting(corrs.corr_values, corrs['features'], figsize)
        return corrs

    # recode and impute on a copy so the caller's frame stays untouched
    df = df_main.copy()
    df = df.replace(mapping)
    # replace() does not guarantee a numeric dtype for a recoded string column;
    # make it explicit so the target is found among the numeric columns below
    df[var_of_interest] = pd.to_numeric(df[var_of_interest])
    # numeric_only=True: averaging the object columns raises in pandas >= 2.0
    df = df.fillna(df.mean(numeric_only=True))

    # correlating non-categorical variables
    cols_floats = [col for col in df.columns if df[col].dtype != 'object']
    cols_floats.remove(var_of_interest)
    corrs_one = calc_corr(var_of_interest, df, cols_floats, figsize)

    # correlating categorical variables (one-hot encoded)
    cols_cats = [col for col in df.columns if df[col].dtype == 'object']
    if cols_cats:
        df_dummies = pd.get_dummies(df[cols_cats], dtype=float)
        cols_cats = df_dummies.columns
        df_dummies[var_of_interest] = df[var_of_interest]
        corrs_two = calc_corr(var_of_interest, df_dummies, cols_cats, (5, 10))
    else:
        corrs_two = 0
    return [corrs_one, corrs_two]

In [None]:
# 'Under 20' is derived from Age, so Age must not take part in the correlation.
# errors='ignore' keeps this cell re-runnable: `del young['Age']` raises a
# KeyError as soon as the cell is executed a second time.
young.drop(columns=['Age'], inplace=True, errors='ignore')

corrs_area = correlation_plot(var_of_interest, young, mapping)

In [None]:
# Reducing multicollinearity: rank all feature pairs by their correlation.
# Only numeric/bool columns are correlated; pandas >= 2.0 raises on string
# columns instead of silently skipping them.
corr = young.select_dtypes(include=['number', 'bool']).corr()
# Strict upper triangle: every pair appears once and the diagonal is left out.
# The builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
# NOTE: `os` shadows the name of the standard-library module; the name is kept
# because the optional step below refers to it.
os = corr.where(upper_mask).stack().sort_values(ascending=False)

# drop_colinera_cols = os[abs(os)>0.5].reset_index()['level_1']

In [None]:
# Machine learning: data preprocessing
# Split the columns by dtype: object columns are treated as categorical,
# all other columns as numeric features.

features_int = [col for col in young.columns if young[col].dtype != 'object']

features_cats = [col for col in young.columns if young[col].dtype == 'object']
print(len(features_cats))
print(len(features_int))


print(features_cats)
print("\n")

print(features_int)



# features_int = list(set(features_int) - set(drop_colinera_cols))
# De-duplicate while keeping the column order. Going through set() would
# produce a different feature order in every kernel session, because the
# iteration order of a set of strings is not stable between interpreter runs.
features_int = list(dict.fromkeys(features_int))
print(len(features_int))


In [None]:
# Numeric feature matrix; every gap is filled with the mean of its own column
X = young[features_int]
mean_values = X.mean(axis=0)
X = X.fillna(mean_values)

In [None]:
# Encode the target as integers using the codes stored in `mapping`.
# Series.map returns a new Series, so the column inside `young` is not mutated
# through a selection, and the explicit cast guarantees the integer dtype that
# scikit-learn needs (replace() may leave a recoded string column as object).
Y = young[var_of_interest].map(mapping[var_of_interest]).astype(int)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
# Grid search for parameter tuning: choose the regularisation strength C of a
# logistic regression by 5-fold cross-validation on the F1 score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

clr = LogisticRegression()
KF = KFold(n_splits=5)
param_grid = {'C': [.001, .01, .03, .1, .3, 1, 3, 10]}
grsearch = GridSearchCV(clr, param_grid=param_grid, cv=KF, scoring='f1')
grsearch.fit(x_train, y_train)
print(grsearch.best_params_)

# Fit a logistic regression with the selected C and evaluate it
clr = LogisticRegression(C=grsearch.best_params_['C'])
clr.fit(x_train, y_train)

mean_accuracy = np.mean(cross_val_score(clr, x_train, y_train, cv=KF))
print('Average accuracy score on CV set: {:.2f}'.format(mean_accuracy))

mean_f1 = np.mean(cross_val_score(clr, x_train, y_train, cv=KF, scoring='f1'))
print('Average f1 on CV set: {:.2f}'.format(mean_f1))
print('')

# Predict once and reuse the result for every test metric
y_pred = clr.predict(x_test)
print('Accuracy score on test set is: {:.2f}'.format(clr.score(x_test, y_test)))
recall = recall_score(y_test, y_pred)
print('Recall on test: {:.2f}'.format(recall))
precision = precision_score(y_test, y_pred)
print('Precision on test: {:.2f}'.format(precision))
# f1_score is the same 2PR / (P + R), but it is still defined when P + R == 0,
# where the hand-written formula divides by zero
print('F1 score on test: {:.2f}'.format(f1_score(y_test, y_pred)))


In [None]:
# Grid search for parameter tuning with leave-one-out cross-validation (LOOCV)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict

clr = LogisticRegression()
KF = KFold(n_splits=len(x_train))  # LOOCV: one fold per training sample
param_grid = {'C': [.001, .01, .03, .1, .3, 1, 3, 10]}
# Each LOOCV fold scores a single sample. F1 on one sample is 1 only for a
# true positive and 0 for everything else (including a correct negative), so
# it is not a usable selection criterion here; accuracy is well defined.
grsearch = GridSearchCV(clr, param_grid=param_grid, cv=KF, scoring='accuracy')
grsearch.fit(x_train, y_train)
print(grsearch.best_params_)

# Fit a logistic regression with the selected C and evaluate it
clr = LogisticRegression(C=grsearch.best_params_['C'])
clr.fit(x_train, y_train)

# One LOOCV pass yields an out-of-fold prediction for every training sample;
# both CV metrics are computed from these pooled predictions.
loo_pred = cross_val_predict(clr, x_train, y_train, cv=KF)

# identical to averaging the per-fold (single-sample) accuracies
mean_accuracy = np.mean(loo_pred == np.asarray(y_train))
print('Average accuracy score on CV set: {:.2f}'.format(mean_accuracy))

# F1 over the pooled predictions instead of an average of per-sample F1 values
mean_f1 = f1_score(y_train, loo_pred)
print('F1 on CV set: {:.2f}'.format(mean_f1))
print('')

# Predict once and reuse the result for every test metric
y_pred = clr.predict(x_test)
print('Accuracy score on test set is: {:.2f}'.format(clr.score(x_test, y_test)))
recall = recall_score(y_test, y_pred)
print('Recall on test: {:.2f}'.format(recall))
precision = precision_score(y_test, y_pred)
print('Precision on test: {:.2f}'.format(precision))
# f1_score is still defined when P + R == 0, where the hand-written
# 2PR / (P + R) divides by zero
print('F1 score on test: {:.2f}'.format(f1_score(y_test, y_pred)))


In [None]:
# Rank every feature by its fitted logistic-regression coefficient, largest first
feat_coeff = (
    pd.DataFrame({'features': X.columns, 'impacts': clr.coef_[0]})
    .sort_values('impacts', ascending=False)
)

fig, ax1 = plt.subplots(1, 1, figsize=(30, 6))
sns.barplot(x=feat_coeff.features, y=feat_coeff.impacts, ax=ax1)
ax1.set_title('All features', size=30)
ax1.set_xticklabels(labels=feat_coeff.features, size=20, rotation=90)
# trailing semicolon keeps the notebook from echoing the returned Text object
ax1.set_ylabel('Impact', size=30);

In [None]:
# The six largest and the six smallest coefficients (12 rows, despite the name)
strongest_positive = feat_coeff.iloc[:6]
strongest_negative = feat_coeff.iloc[-6:]
top10 = pd.concat([strongest_positive, strongest_negative])

fig, ax1 = plt.subplots(1, 1, figsize=(10, 6))
sns.barplot(y=top10.features, x=top10.impacts, ax=ax1)
ax1.set_title('Top 12 features', size=20)
ax1.set_yticklabels(labels=top10.features, size=15)
# trailing semicolon keeps the notebook from echoing the returned Text object
ax1.set_xlabel('Impact', size=20);


In [None]:
# Which factors best separate people who drink a lot from those who never
# drink or only drink socially?

clean_data = young.dropna(subset=['Alcohol'])
features_int = [col for col in clean_data.columns if clean_data[col].dtype != 'object']
X = clean_data[features_int]
# fill the gaps of every numeric column with that column's mean
mean_values = X.mean(axis=0)
X = X.fillna(mean_values)

# Binary target: 1 = drinks a lot, 0 = never / social drinker.
# map() builds a new integer Series instead of rewriting a column selection of
# `clean_data` in place, and the cast guarantees the integer dtype scikit-learn
# needs. An unexpected answer becomes NaN and makes the cast fail loudly.
alcohol_codes = {'never': 0, 'social drinker': 0, 'drink a lot': 1}
Y = clean_data['Alcohol'].map(alcohol_codes).astype(int)

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

clr = LogisticRegression()
clr.fit(X, Y)
feat_coeff = pd.DataFrame({'features': features_int, 'impacts': clr.coef_[0]})
feat_coeff = feat_coeff.sort_values('impacts', ascending=False)
# NOTE(review): row label 73 is the position of one feature in `features_int`;
# presumably a specific column is meant to be excluded from the chart. Confirm
# which one and drop it by name instead of by position.
feat_coeff = feat_coeff.drop(index=[73])

# eight largest and eight smallest coefficients (16 rows, despite the name)
top10 = pd.concat([feat_coeff.head(8), feat_coeff.tail(8)])
fig, ax1 = plt.subplots(1, 1, figsize=(10, 6))
sns.barplot(y=top10.features, x=top10.impacts, ax=ax1)
ax1.set_title('Top 16 features', size=20)
ax1.set_yticklabels(labels=top10.features, size=15)
ax1.set_xlabel('Impact', size=20);

In [None]:
# Which factors best separate people who smoke or have smoked from those who
# never smoked?

clean_data = young.dropna(subset=['Smoking'])
features_int = [col for col in clean_data.columns if clean_data[col].dtype != 'object']
X = clean_data[features_int]
# fill the gaps of every numeric column with that column's mean
mean_values = X.mean(axis=0)
X = X.fillna(mean_values)

# Binary target: 0 = never smoked, 1 = tried / current / former smoker.
# map() builds a new integer Series instead of rewriting a column selection of
# `clean_data` in place, and the cast guarantees the integer dtype scikit-learn
# needs. An unexpected answer becomes NaN and makes the cast fail loudly.
smoking_codes = {
    'never smoked': 0,
    'tried smoking': 1,
    'current smoker': 1,
    'former smoker': 1,
}
Y = clean_data['Smoking'].map(smoking_codes).astype(int)


scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

clr = LogisticRegression()
clr.fit(X, Y)
feat_coeff = pd.DataFrame({'features': features_int, 'impacts': clr.coef_[0]})
feat_coeff = feat_coeff.sort_values('impacts', ascending=False)

# eight largest and eight smallest coefficients (16 rows, despite the name)
top10 = pd.concat([feat_coeff.head(8), feat_coeff.tail(8)])
fig, ax1 = plt.subplots(1, 1, figsize=(10, 6))
sns.barplot(y=top10.features, x=top10.impacts, ax=ax1)
ax1.set_title('Top 16 features', size=20)
ax1.set_yticklabels(labels=top10.features, size=15)
ax1.set_xlabel('Impact', size=20);
