In [None]:
import pandas as pd
import numpy as np
import csv 
import seaborn as sns

In [None]:
# Load the raw Adult data file. It has no header row, so the column names
# are supplied explicitly.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'salary',
]
df = pd.read_csv(
    './Adult Data Set/adult.data',
    index_col=False,
    header=None,
    names=column_names,
)

# Columns removed before modelling:
#   education - education-num is representative of the same thing.
#   fnlwgt    - the final weight, i.e. the number of units in the target
#               population that the responding unit represents; irrelevant
#               as a feature.
#   others    - after measuring accuracy with the logistic regression and
#               naive Bayes models, these features did not appear to improve
#               it.
dropped_columns = [
    'education',
    'fnlwgt',
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
df.drop(columns=dropped_columns, inplace=True)

# Encode the target as integers the models can handle: 0 when the label is
# exactly ' <=50K' (note the leading space), 1 for anything else.
df['salary'] = (df['salary'] != ' <=50K').astype('int64')

# Replace the ' ?' placeholder that marks missing values with NaN, then report
# how many missing values each column contains.
df.replace(' ?', np.nan, inplace=True)
print(df.isna().sum())
print("-----")


# Print the normalized frequency of every distinct value (least frequent
# first) for the target classes and the other numerical features, with a
# separator line after each table.
for column in ('salary', 'age', 'education-num', 'capital-gain',
               'capital-loss', 'hours-per-week'):
    shares = df[column].value_counts(normalize=True, dropna=True, ascending=True)
    print(shares)
    print("-----")

# print(df.describe())

# capital-gain and capital-loss are removed as well, because their data is
# malformed.
df.drop(columns=['capital-gain', 'capital-loss'], inplace=True)

# One-hot encoding using pandas get_dummies and then dropping
# one category for each categorical feature to preserve linear independence
# df['workclass'] = df['workclass'].astype('category')
# df['marital-status'] = df['marital-status'].astype('category')
# df['occupation'] = df['occupation'].astype('category')
# df['relationship'] = df['relationship'].astype('category')
# df['race'] = df['race'].astype('category')
# df['sex'] = df['sex'].astype('category')
# df['native-country'] = df['native-country'].astype('category')
# df = pd.get_dummies(df , drop_first= True)

# Fill the missing values of every column with that column's mean.
column_means = df.mean()
df = df.fillna(column_means)
print("-----")

# Compute the pairwise (Pearson) correlation between the remaining columns
# and draw it as a heatmap.
corr = df.corr(method='pearson')
sns.heatmap(data=corr)
print("-----")