# QUESTION: 
*Can we predict sex with education level and income?*

## Classifiers used: 
* K-Nearest Neighbor
* Support Vector Machine

In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import preprocessing  # For normalizing the data

# Classification imports required
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Regression imports required
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

## Getting the data:

In [4]:
df = pd.read_csv('profiles.csv')

#### Cleaning the data (removing nan)
We must get rid of any rows that are missing values so that we can run it through the models without issues

In [5]:
dataset = ['sex', 'income', 'education']

print(df[dataset].isna().any(), '\n')
print(df.income.describe())  # For income na is represented by -1

sex          False
income       False
education     True
dtype: bool 

count      59946.000000
mean       20033.222534
std        97346.192104
min           -1.000000
25%           -1.000000
50%           -1.000000
75%           -1.000000
max      1000000.000000
Name: income, dtype: float64


In [6]:
df = df[df.income != -1] # Only includes rows that dont have -1 income
df.dropna(subset=dataset, inplace=True)

#### Mapping the data
sex, and education are categorical peices of data that can be mapped to ordinal numbers 

In [7]:
mapping_specific = {'graduated from space camp': 3, 
              'graduated from high school': 7, 
              'graduated from two-year college': 11, 
              'graduated from college/university': 15,
              'graduated from masters program': 19, 
              'graduated from law school': 23, 
              'graduated from med school': 23, 
              'graduated from ph.d program': 23, 

              'dropped out of space camp': 0, 
              'dropped out of high school': 4, 
              'dropped out of two-year college': 8, 
              'dropped out of college/university': 12, 
              'dropped out of masters program' : 16, 
              'dropped out of law school': 20, 
              'dropped out of med school':20, 
              'dropped out of ph.d program': 20, 

              'working on space camp':1, 
              'working on high school':5, 
              'working on two-year college': 9,
              'working on college/university': 13,
              'working on masters program': 17, 
              'working on law school' : 21, 
              'working on med school' : 21,
              'working on ph.d program': 21,

              'space camp': 2,
              'high school': 6,
              'two-year college': 10,
              'college/university': 14,
              'masters program': 18,
              'law school': 22, 
              'ph.d program': 22}

# 3 catagories: graduated, working on/nil, and dropped out
mapping_general = {'graduated from space camp': 2, 
              'graduated from high school': 5, 
              'graduated from two-year college': 8, 
              'graduated from college/university': 10,
              'graduated from masters program': 13, 
              'graduated from law school': 16, 
              'graduated from med school': 16, 
              'graduated from ph.d program': 16, 

              'dropped out of space camp': 0, 
              'dropped out of high school': 3, 
              'dropped out of two-year college': 6, 
              'dropped out of college/university': 9, 
              'dropped out of masters program' : 11, 
              'dropped out of law school': 14, 
              'dropped out of med school':14, 
              'dropped out of ph.d program': 14, 

              'working on space camp':1, 
              'working on high school':4, 
              'working on two-year college': 7,
              'working on college/university': 10,
              'working on masters program': 12, 
              'working on law school' : 15, 
              'working on med school' : 15,
              'working on ph.d program': 15,

              'space camp': 1,
              'high school': 4,
              'two-year college': 7,
              'college/university': 10,
              'masters program': 12,
              'law school': 15, 
              'ph.d program': 15}

mapping_general_nospace = {'graduated from space camp': 0, 
                          'graduated from high school': 5, 
                          'graduated from two-year college': 8, 
                          'graduated from college/university': 10,
                          'graduated from masters program': 13, 
                          'graduated from law school': 16, 
                          'graduated from med school': 16, 
                          'graduated from ph.d program': 16, 

                          'dropped out of space camp': 0, 
                          'dropped out of high school': 3, 
                          'dropped out of two-year college': 6, 
                          'dropped out of college/university': 9, 
                          'dropped out of masters program' : 11, 
                          'dropped out of law school': 14, 
                          'dropped out of med school':14, 
                          'dropped out of ph.d program': 14, 

                          'working on space camp':0, 
                          'working on high school':4, 
                          'working on two-year college': 7,
                          'working on college/university': 10,
                          'working on masters program': 12, 
                          'working on law school' : 15, 
                          'working on med school' : 15,
                          'working on ph.d program': 15,

                          'space camp': 0,
                          'high school': 4,
                          'two-year college': 7,
                          'college/university': 10,
                          'masters program': 12,
                          'law school': 15, 
                          'ph.d program': 15}

In [8]:
df['sex_i'] = df.sex.map({'m':0, 'f':1})
df['education_i'] = df.education.map(mapping_general)

#### Normalizing the data
Normalizing data is only important when there are huge gaps in magnitude between feature data. So in this case where all our data is categorical it isn't really necessary.

*We dont really have to normalize here because the features are all categorical and they don't differ by huge magnitudes.*

The syntax for extracting rows from ndarrays is
    `A_NEW = A[start_index_row : stop_index_row,start_index_columnn : stop_index_column)]`

#### Splitting up the data
To get training and testing sets

In [9]:
feature_data = df[['income', 'education_i']]
label = df[['sex_i']]

print(feature_data.shape, label.shape)

train_data, test_data, train_labels, test_labels = train_test_split(feature_data, label, train_size=0.8, random_state=23)

print(train_data.shape, train_labels.shape)

(10783, 2) (10783, 1)
(8626, 2) (8626, 1)


https://stackoverflow.com/questions/34165731/a-column-vector-y-was-passed-when-a-1d-array-was-expected

`.values` will give the values in an array. `(shape: (n,1)`

`.ravel` will convert that array shape to `(n, )`

In [10]:
train_labels = train_labels.values.ravel()
test_labels = test_labels.values.ravel()

# K-Nearest Neighbor
With default parameters n_neighbors=5 we get an accuracy of 70.79%


#### Exploring differant values of k:
With Specific mapping (of education):

`k value: 27 Score: 0.7316`

With General mapping:

`k-value: 12 Score: 0.7320`

In [None]:
score_test = []
score_train = []
k_values = list(range(1, 50, 1))
for k in k_values:
    print(k, end=' ')
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(train_data, train_labels)
    score_train.append(model.score(train_data, train_labels))
    score_test.append(model.score(test_data, test_labels))

max_score = max(score_test)
print('\nBest K-value:', k_values[score_test.index(max_score)])
print('score:', max_score)
    
fig = plt.figure()
ax = plt.subplot(111)
# plt.rcParams['figure.figsize'] = (9, 9)
ax.plot(k_values, score_test, label="test")
ax.plot(k_values, score_train, label='train', c='r')
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.10), ncol=2)
plt.ylim(0.6, .9)
# plt.axvline(10, 0, 1, c='g')
plt.show()

# Support Vector Machine
With default parameters we get a score of 0.7246


#### Exploring other parameters:
`Best score: 0.7273991655076495
	C: 0.1560689655172414 
	gamma: 0.0526896551724138`

In [23]:
c_range = list(np.linspace(0.001, 1.5, 30))
g_range = list(np.linspace(0.001, 1.5, 30))
scores = []
points = []

for c in c_range:
    print(c)
    for g in g_range:
        model = SVC(C=c, gamma=g)
        model.fit(train_data, train_labels)
        scores.append(model.score(test_data, test_labels))
        points.append((g, c))
        print('\t', g)

best_score = max(scores)
best_point = points[scores.index(best_score)]
print('Best score:', best_score)
print('\tC: {} \n\tgamma: {}'.format(best_point[0], best_point[1]))


0.001
	 0.001
	 0.0526896551724138
	 0.1043793103448276
	 0.1560689655172414
	 0.2077586206896552
	 0.259448275862069
	 0.3111379310344828
	 0.3628275862068966
	 0.4145172413793104
	 0.4662068965517242
	 0.517896551724138
	 0.5695862068965518
	 0.6212758620689656
	 0.6729655172413794
	 0.7246551724137932
	 0.776344827586207
	 0.8280344827586208
	 0.8797241379310345
	 0.9314137931034484
	 0.9831034482758622
	 1.0347931034482758
	 1.0864827586206898
	 1.1381724137931035
	 1.1898620689655173
	 1.241551724137931
	 1.2932413793103448
	 1.3449310344827587
	 1.3966206896551725
	 1.4483103448275862
	 1.5
0.0526896551724138
	 0.001
	 0.0526896551724138
	 0.1043793103448276
	 0.1560689655172414
	 0.2077586206896552
	 0.259448275862069
	 0.3111379310344828
	 0.3628275862068966
	 0.4145172413793104
	 0.4662068965517242
	 0.517896551724138
	 0.5695862068965518
	 0.6212758620689656
	 0.6729655172413794
	 0.7246551724137932
	 0.776344827586207
	 0.8280344827586208
	 0.8797241379310345
	 0.93141379310

	 0.4662068965517242
	 0.517896551724138
	 0.5695862068965518
	 0.6212758620689656
	 0.6729655172413794
	 0.7246551724137932
	 0.776344827586207
	 0.8280344827586208
	 0.8797241379310345
	 0.9314137931034484
	 0.9831034482758622
	 1.0347931034482758
	 1.0864827586206898
	 1.1381724137931035
	 1.1898620689655173
	 1.241551724137931
	 1.2932413793103448
	 1.3449310344827587
	 1.3966206896551725
	 1.4483103448275862
	 1.5
0.7246551724137932
	 0.001
	 0.0526896551724138
	 0.1043793103448276
	 0.1560689655172414
	 0.2077586206896552
	 0.259448275862069
	 0.3111379310344828
	 0.3628275862068966
	 0.4145172413793104
	 0.4662068965517242
	 0.517896551724138
	 0.5695862068965518
	 0.6212758620689656
	 0.6729655172413794
	 0.7246551724137932
	 0.776344827586207
	 0.8280344827586208
	 0.8797241379310345
	 0.9314137931034484
	 0.9831034482758622
	 1.0347931034482758
	 1.0864827586206898
	 1.1381724137931035
	 1.1898620689655173
	 1.241551724137931
	 1.2932413793103448
	 1.3449310344827587
	 1.3966

	 0.9314137931034484
	 0.9831034482758622
	 1.0347931034482758
	 1.0864827586206898
	 1.1381724137931035
	 1.1898620689655173
	 1.241551724137931
	 1.2932413793103448
	 1.3449310344827587
	 1.3966206896551725
	 1.4483103448275862
	 1.5
1.3966206896551725
	 0.001
	 0.0526896551724138
	 0.1043793103448276
	 0.1560689655172414
	 0.2077586206896552
	 0.259448275862069
	 0.3111379310344828
	 0.3628275862068966
	 0.4145172413793104
	 0.4662068965517242
	 0.517896551724138
	 0.5695862068965518
	 0.6212758620689656
	 0.6729655172413794
	 0.7246551724137932
	 0.776344827586207
	 0.8280344827586208
	 0.8797241379310345
	 0.9314137931034484
	 0.9831034482758622
	 1.0347931034482758
	 1.0864827586206898
	 1.1381724137931035
	 1.1898620689655173
	 1.241551724137931
	 1.2932413793103448
	 1.3449310344827587
	 1.3966206896551725
	 1.4483103448275862
	 1.5
1.4483103448275862
	 0.001
	 0.0526896551724138
	 0.1043793103448276
	 0.1560689655172414
	 0.2077586206896552
	 0.259448275862069
	 0.311137931034