In [0]:
from google.colab import files
uploaded = files.upload()

In [0]:
import pandas as pd
income = pd.read_csv('income_data.csv')
income.head()

In [0]:
income.shape

In [0]:
assert pd.notnull(income).all().all()

In [0]:
income.info() #get unique data in dataset. seems like no NAs.

In [0]:
income.columns

In [0]:
colNameDict = {' workclass':'workclass',' fnlwgt':'fnlwgt', ' education':'education', ' education-num':'educationnum', ' marital-status':'maritalstatus',
               ' occupation':'occupation', ' relationship':'relationship', ' race':'race', ' sex':'sex', ' capital-gain':'capitalgain', 
               ' capital-loss':'capitalloss', ' hours-per-week':'hoursperweek', ' native-country':'nativecountry', ' income':'income'
               }
income.rename(columns = colNameDict,inplace=True)  #some columne names contain whitespace. replace them
income.columns

In [0]:
income.describe() #check on the numeric columns

In [0]:
import matplotlib.pyplot as plt
income['age'].plot(kind='hist', rot=70, logx=False, logy=False)
plt.show()
#and plot other numeric variables... seems fine.

In [0]:
income[income['hoursperweek'] == 99].head() #there are 85 people work 99 hours perweek. so far looks legit.

In [0]:
%matplotlib inline
pd.crosstab(income['educationnum'],income['income']).plot(kind='bar')
plt.title('Income against Number of years of Education')
plt.xlabel('Education Num')
plt.ylabel('High Salary')

Test Train set split and fitting

In [0]:
import numpy as np

In [0]:
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [0]:
income.income.value_counts()
#approx. plain income : high income = 3:1

In [0]:
sns.countplot(x=income['income'], data= income, palette='hls')
plt.show()
plt.savefig('count_plot')

In [0]:
income.groupby('income').mean()

In [0]:
corrdf=income.corr()
corrdf

In [0]:
corrdf['income'].sort_values(ascending=False)

In [0]:
income.educationnum.hist()
plt.title('Histogram of Number of years of Education')
plt.xlabel('Education Num')
plt.ylabel('Frequency')
plt.savefig('hist_edu')

In [0]:
income.age.hist()
plt.title('Histogram of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.savefig('hist_age')

In [0]:
income.hoursperweek.hist()
plt.title('Histogram of Hours per week')
plt.xlabel('Hours')
plt.ylabel('Frequency')
plt.savefig('hist_hours')

In [0]:
keepers = ['income', 'age', 'educationnum', 'capitalgain', 'hoursperweek']
data=income[keepers]
data.info()

In [0]:
data.head()

In [0]:
X = data.loc[:, data.columns != 'income']
y = data.loc[:, data.columns == 'income']
from imblearn.over_sampling import SMOTE #apply oversampling technique to generate a balanced dataset
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
os_data_X,os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['income'])
print("length of oversampled data is ",len(os_data_X))
print("Number of plain income in oversampled data",len(os_data_y[os_data_y['income']==0]))
print("Number of high income",len(os_data_y[os_data_y['income']==1]))
print("Proportion of plain income data in oversampled data is ",len(os_data_y[os_data_y['income']==0])/len(os_data_X))
print("Proportion of high income data in oversampled data is ",len(os_data_y[os_data_y['income']==1])/len(os_data_X))

In [0]:
os_data_X = os_data_X.astype(np.int)
os_data_y= os_data_y.astype(np.int)
y = os_data_y
X = os_data_X
import statsmodels.api as sm
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

In [0]:
#model fitting
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [0]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

In [0]:
#confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix) #correction prediction(3837+3773), incorrect prediction(1305+1473)

In [0]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [0]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show() #ROC curve stays above average dotted line. the model is good.

In [0]:
question6 = ['income', 'occupation','hoursperweek']
data6 = income[question6]