# Top Model

### Import Preliminaries

In [21]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from tqdm import *

# Read the train and test splits. low_memory=False asks pandas to parse each
# file in one pass rather than in internal chunks (see pandas read_csv docs).
train_data = pd.read_csv('Data/train.csv', low_memory=False)
test_data = pd.read_csv('Data/test.csv', low_memory=False)

# Both raw frames collected in one list, train first.
dfs = [train_data, test_data]

### Filter DataFrame

In [92]:
# Target column (only present in the training data).
train_df_classes = train_data['project_is_approved']

# Non-essay metadata columns used as model inputs.
features = [
    'teacher_prefix',
    'school_state',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'teacher_number_of_previously_posted_projects',
]

# Select the feature columns and replace every missing value with the
# placeholder string 'No Essay' (the same placeholder is applied to all
# selected columns, whatever their dtype).
train_df = train_data[features].fillna(value='No Essay')
test_df = test_data[features].fillna(value='No Essay')

### Encoding Values

In [5]:
# Encode text columns as integer category codes.
#
# The category list for each column is built from the combined train + test
# values, so a given label maps to the SAME integer in both frames. Encoding
# the two frames independently would give each its own label -> code mapping,
# and a model fitted on the train codes would misinterpret the test codes.
train_object_cols = set(train_df.select_dtypes('object').columns)
test_object_cols = set(test_df.select_dtypes('object').columns)
object_cols = [col for col in train_df.columns
               if col in train_object_cols or col in test_object_cols]

for col in object_cols:
    # Shared vocabulary for this column across both splits.
    shared_categories = (
        pd.concat([train_df[col], test_df[col]], ignore_index=True)
        .astype('category')
        .cat.categories
    )
    train_df[col] = pd.Categorical(train_df[col], categories=shared_categories).codes
    test_df[col] = pd.Categorical(test_df[col], categories=shared_categories).codes

### Dataframe to Values

In [6]:
# Extract the underlying NumPy arrays from the pandas objects:
# X / X_test hold the feature matrices, y holds the approval labels.
X, X_test = train_df.values, test_df.values
y = train_df_classes.values

### Set Up Logistic Regression Model

In [10]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default hyper-parameters on the feature
# matrix X and labels y. fit() returns the estimator itself, so the fitted
# model can be bound in one step and then displayed as the cell output.
model = LogisticRegression().fit(X, y)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Cross-Validate Logistic Regression Model

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# 10-fold cross-validation of the logistic model on accuracy.
# shuffle=True is required for random_state to have any effect: without it
# the seed is ignored by older scikit-learn and rejected with a ValueError by
# recent releases. With shuffling on, the folds are random but reproducible.
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(model, X, y,
                         scoring='accuracy',
                         cv=crossvalidation, n_jobs=1)

# Accuracy is never negative, so the scores are averaged directly.
print ('Folds: %i,accuracy: %.2f std: %.2f'
       % (len(scores), np.mean(scores), np.std(scores)))

Folds: 10,accuracy: 0.85 std: 0.00


### Prediction Model

In [17]:
# Predict a class label for every test row with the fitted model and wrap
# the result in a named Series.
test_predictions = model.predict(X_test)
logpred = pd.Series(test_predictions, name='logistic feature')
logpred.head(n=5)

0    1
1    1
2    1
3    1
4    1
Name: logistic feature, dtype: int64

### Essay Data

In [31]:
# Training Data
# Free-text columns: the four essay fields plus the resource summary.
essay_features = ['project_essay_1','project_essay_2','project_essay_3','project_essay_4','project_resource_summary']

# Select the text columns once and fill missing essays with a placeholder.
# NOTE: this rebinds train_df / test_df, which previously held the encoded
# metadata features.
train_df = train_data[essay_features].fillna(value='No Essay')
test_df = test_data[essay_features].fillna(value='No Essay')

In [32]:
# Preview the first three rows of the essay text frame.
train_df.head(n=3)

Unnamed: 0,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary
0,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,No Essay,No Essay,My students need 6 Ipod Nano's to create and d...
1,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,No Essay,No Essay,My students need matching shirts to wear for d...
2,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,No Essay,No Essay,My students need the 3doodler. We are an SEM s...


In [52]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Text-pipeline settings (presumably consumed by later padding and
# train/validation split steps; confirm against the cells that use them).
max_sequence_length = 200
validation_split = 0.1

# Raw text of the first essay, extracted once and reused below.
essay_one_texts = train_df.project_essay_1.values

# Fit the tokenizer on the essay text, then convert each essay into a list
# of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(essay_one_texts)
sequences = tokenizer.texts_to_sequences(essay_one_texts)

# Spot-check a few random encoded essays.
pd.Series(sequences).sample(n=5)

118932    [13, 44, 8, 6, 160, 12, 145, 1218, 6224, 2892,...
120896    [61, 37, 4, 290, 118, 15, 28, 912, 11, 7, 203,...
82106     [134, 38, 13, 55, 10, 1931, 4, 20, 7, 152, 3, ...
68253     [71, 5, 10, 143, 4, 32, 25, 77, 106, 474, 2, 7...
137887    [968, 1, 15, 2868, 10, 4, 617, 1631, 22, 7, 40...
dtype: object

In [69]:
# Word -> integer index mapping held by the fitted tokenizer; show a random
# handful of entries.
word_index = tokenizer.word_index
pd.Series(word_index).sample(n=5)

disorderly    26416
overseen      27824
‘firsts’      46041
origin         5657
gazes         18095
dtype: int64

In [83]:
# Pad / truncate every encoded essay to max_sequence_length tokens so the
# sequences form a rectangular matrix (shorter essays are zero-padded at the
# front, as the sample output shows).
essay_one_padded = pad_sequences(sequences, maxlen=max_sequence_length)

# Wrap the padded matrix itself in a DataFrame. The previous code passed an
# undefined name (`data`), which only worked through leftover kernel state.
essay_one_data = pd.DataFrame(essay_one_padded)
essay_one_data.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
20628,0,0,0,0,0,0,0,0,0,0,...,109,561,8,588,48,2,1195,15,562,1053
90516,0,0,0,0,0,0,0,0,0,0,...,1,191,6,12,20,2806,3,86,11,58
101839,0,0,0,0,0,0,0,0,0,0,...,5,462,811,21,23,8,3,28,9,18
13101,0,0,0,0,0,0,0,0,0,0,...,43,361,23,1,237,14,166,279,9,18
148222,0,0,0,0,0,0,0,0,0,0,...,70,2,11,7,6,217,5,15,28,69
143496,0,0,0,0,0,0,0,0,0,0,...,5,20,187,2,67,19,76,8,15,49
108540,0,0,0,0,0,0,0,0,0,0,...,61,37,1,29,3,73,104,13,38,29
172144,0,0,0,0,0,0,0,0,0,0,...,6,5732,42,1226,535,11,7,366,336,2388
69070,0,0,0,0,0,0,0,0,0,0,...,2,707,34,3,545,5,3,12,52,81
118012,0,0,0,0,0,0,0,0,0,0,...,268,11,7,408,1,235,2,207,26,68


In [82]:
# Preview the first five rows of the essay text frame (head() defaults to 5).
train_df.head()

Unnamed: 0,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary
0,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,No Essay,No Essay,My students need 6 Ipod Nano's to create and d...
1,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,No Essay,No Essay,My students need matching shirts to wear for d...
2,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,No Essay,No Essay,My students need the 3doodler. We are an SEM s...
3,My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",No Essay,No Essay,My students need balls and other activity equi...
4,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,No Essay,No Essay,My students need a water filtration system for...


In [81]:
# Preview the first five approval labels (head() defaults to 5).
train_df_classes.head()

0    1
1    0
2    1
3    0
4    1
Name: project_is_approved, dtype: int64

In [103]:
# Append the approval label as the last column of the padded-essay frame.
#
# * The full frame is kept: the previous `.head(3)` on the assignment
#   truncated the dataset to three rows, so it is now used for display only.
# * Any existing label column is dropped first so that re-running this cell
#   does not append a duplicate 'project_is_approved' column.
essay_one_data = pd.concat(
    [essay_one_data.drop(columns='project_is_approved', errors='ignore'),
     train_df_classes],
    axis=1,
)
essay_one_data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,project_is_approved
0,0,0,0,0,0,0,0,0,0,0,...,1,156,3,790,22,43,256,25,417,1
1,0,0,0,0,0,0,0,0,0,0,...,1,14,1646,279,151,400,739,1339,719,0
2,0,0,0,0,0,0,0,0,0,0,...,3583,3,4,8,204,1,32,9,18,1


In [104]:
# Feature columns are every column except the trailing one, which holds the
# 'project_is_approved' label.
all_columns = essay_one_data.columns
essay_one_features = all_columns[:-1]
essay_one_features

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       190, 191, 192, 193, 194, 195, 196, 197, 198, 199],
      dtype='object', length=200)

In [112]:
# Split the combined frame into a label vector and a feature matrix.
# NOTE: this rebinds X and y, replacing the metadata-feature arrays that the
# logistic model was fitted on earlier.
y = essay_one_data['project_is_approved'].values
X = essay_one_data.loc[:, essay_one_features].values