# Top Model

### Import Preliminaries

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense

from keras import layers
from keras.optimizers import RMSprop
from keras.layers import Embedding, SimpleRNN, Dense



# Prep Sample File Function
def prep_sample_submission (model_proba):
    model_proba['project_is_approved'] = 0
    for index, row in model_proba.iterrows():
        if model_proba.loc[index,0] > model_proba.loc[index,0]:
            model_proba.loc[index,'project_is_approved'] = model_proba.loc[index,0]
        else:
            model_proba.loc[index,'project_is_approved'] = model_proba.loc[index,1]
    return model_proba

pd.set_option('max_columns',1000)

Using TensorFlow backend.


### Data Processing

In [4]:
# Import Data
test_data = pd.read_csv('Data/test.csv', low_memory=False)
train_data = pd.read_csv('Data/train.csv', low_memory=False)

train_df_classes = train_data['project_is_approved']

# Training Features
features = ['teacher_prefix', 'school_state', 'project_grade_category', 'project_subject_categories', 'project_subject_subcategories',
            'teacher_number_of_previously_posted_projects']

# Training and Testing Dataframes
train_df = train_data[features]
test_df = test_data[features]
train_df = train_df.fillna(value='No Essay')
test_df = test_df.fillna(value='No Essay')
test_ids = test_data.id

#Encoding Values
for col in list(train_df.select_dtypes('object').columns):
    train_df[col] = train_df[col].astype('category')
    train_df[col] = train_df[col].cat.codes
    
for col in list(test_df.select_dtypes('object').columns):
    test_df[col] = test_df[col].astype('category')
    test_df[col] = test_df[col].cat.codes
    
y = train_df_classes.values
X = train_df.values
X_test = test_df.values

In [5]:
# Taining Data
essay_features = ['project_essay_1','project_essay_2','project_essay_3','project_essay_4','project_resource_summary']

train_edf = train_data[essay_features]
test_edf = test_data[essay_features]

# Filter Essay Features
train_edf = train_data[essay_features]
test_edf = test_data[essay_features]

# Fill DataFrame
train_edf = train_edf.fillna(value='No Essay')
test_edf = test_edf.fillna(value='No Essay')

## Logisical Model

In [5]:
# Train the Logistic Model
model = LogisticRegression()
model.fit(X,y)

# Predict Sample Probabilities
logpred = pd.Series(model.predict(X_test), name='project_is_approved_result')
logpred_proba_train = pd.DataFrame(model.predict_proba(X))
logpred_proba_test = pd.DataFrame(model.predict_proba(X_test))
logpred_sample_file = prep_sample_submission(logpred_proba_test)
logpred_sample_file = pd.concat([test_ids,logpred_sample_file.project_is_approved], axis=1)
logpred_sample_file.to_csv('Submissions/logistic_regression_submission.csv', index=False)

# Cross Validation Score
crossvalidation = KFold(10, random_state=1)
scores = cross_val_score(model, X, y, 
                scoring = 'accuracy',
                cv = crossvalidation, n_jobs =1)

print ('Folds: %i,accuracy: %.2f std: %.2f' 
% (len(scores),np.mean(np.abs(scores)),np.std(scores)))

logpred_sample_file.head()

Folds: 10,accuracy: 0.85 std: 0.00


Unnamed: 0,id,project_is_approved
0,p233245,0.83723
1,p096795,0.847115
2,p236235,0.808259
3,p233680,0.83627
4,p171879,0.854131


# Single Essay Model
### Preprocessing

In [None]:
# Model Hyper Parameters
max_features = 10000
maxlen = 500
validation_split = 0.2

# Tokenization, Sequences, Word Index, and Word Embeddings
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_edf.project_essay_1.values)
train_sequences = tokenizer.texts_to_sequences(train_edf.project_essay_1.values)
word_index = tokenizer.word_index
essay_one_train_data = pad_sequences(train_sequences, maxlen=maxlen)
essay_one_train_data = pd.DataFrame(essay_one_train_data)
essay_one_train_data = pd.concat([essay_one_train_data, train_df_classes], axis=1)
essay_one_train_data.head(10)

# Creating a list of Essay 1 features
essay_one_features = essay_one_train_data.columns[:-1]

# Converting DataFrame Values to Arrays
input_train = essay_one_train_data[essay_one_features].values
y_train = essay_one_train_data['project_is_approved'].values

essay_one_train_data.head(7)

In [None]:
#pd.DataFrame(sequences).describe()

### Single Essay Model: Recurrent RNN in Keras

In [None]:


model = Sequential() 
model.add(Embedding(50000, 32))
model.add(SimpleRNN(32)) 
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc']) 
model.summary()

In [None]:
history = model.fit(input_train, y_train, epochs=3, batch_size=128, validation_split=0.2)

In [None]:

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')

plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

### Export Results

In [None]:
# Tokenization, Sequences, Word Index, and Word Embeddings
tokenizer = Tokenizer()
tokenizer.fit_on_texts(test_edf.project_essay_1.values)
test_sequences = tokenizer.texts_to_sequences(test_edf.project_essay_1.values)
essay_one_test_data = pad_sequences(test_sequences, maxlen=maxlen)
essay_one_test_data = pd.DataFrame(essay_one_test_data)
essay_one_test_data.head(10)

# Converting DataFrame Values to Arrays
input_test = essay_one_test_data[essay_one_features].values

essay_one_test_data.head(7)

In [None]:
# Prediction on the test data
essay_one_pred = pd.DataFrame(model.predict(input_test))
essay_one_pred.columns = ['project_is_approved']

# Export Essay One Prediction Sample Files
essay_one_pred_sample_file = pd.concat([test_ids,essay_one_pred], axis=1)
essay_one_pred_sample_file.to_csv('Submissions/Essay_one_predictions.csv', index=False)
essay_one_pred_sample_file.tail(10)

### Removing the Last Layer of Network

In [None]:
model = Sequential() 
model.add(Embedding(50000, 32))
model.add(SimpleRNN(32)) 
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc']) 
model.summary()

In [None]:
history = model.fit(input_train, y_train, epochs=3, batch_size=128, validation_split=0.2)

### Training a Bidirectional GRU

In [None]:
input_train.shape[-1]

In [None]:
input_train.shape

In [None]:
input_train = input_train[:5000]
y_train = y_train[:5000]
input_test = input_test[:5000]

In [None]:


model = Sequential()
model.add(Embedding(50000, 32))
model.add(layers.Bidirectional(layers.GRU(32),
                              input_shape=(None,input_train.shape[-1],)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

In [None]:
history = model.fit(input_train, y_train, epochs=3, batch_size=128, validation_split=0.2)

In [None]:
import matplotlib.pyplot as plt
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')

plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
# Prediction on test site
essay_one_pred = pd.DataFrame(model.predict(input_test))
essay_one_pred.columns = ['project_is_approved']
essay_one_pred_sample_file = pd.concat([test_ids,essay_one_pred], axis=1)
essay_one_pred_sample_file.to_csv('Submissions/bigru_predictions.csv', index=False)
essay_one_pred_sample_file.tail(10)

# All Essay Model Bi Directional GRU

In [7]:
essay_train_data = pd.DataFrame()
for col in train_edf.columns:
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(train_edf[col].values)
  train_sequences = tokenizer.texts_to_sequences(train_edf[col].values)
  essay_train_data_append = pad_sequences(train_sequences, maxlen=maxlen)
  essay_train_data_append = pd.DataFrame(essay_train_data_append)
  essay_train_data = pd.concat([essay_train_data,essay_train_data_append], axis=1)
  print(col)
  
print(essay_train_data.shape)

NameError: name 'Tokenizer' is not defined

In [None]:
essay_test_data = pd.DataFrame()
for col in test_edf.columns:
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(test_edf[col].values)
  test_sequences = tokenizer.texts_to_sequences(test_edf[col].values)
  essay_test_data_append = pad_sequences(test_sequences, maxlen=maxlen)
  essay_test_data_append = pd.DataFrame(essay_test_data_append)
  essay_test_data = pd.concat([essay_test_data,essay_test_data_append], axis=1)
  print(col)
  
print(essay_test_data.shape)

In [None]:
essay_test_data.columns = list(range(0,essay_test_data.shape[1]))
essay_test_data.tail(3)

In [None]:
input_test = essay_test_data.values
input_test = essay_train_data.values

input_train = input_train[:5000]
y_train = y_train[:5000]
input_test = input_test[:5000]

In [None]:


model = Sequential()
model.add(Embedding(50000, 32))
model.add(layers.Bidirectional(layers.GRU(32),
                              input_shape=(None,input_train.shape[-1],)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

In [None]:
history = model.fit(input_train, y_train, epochs=5, batch_size=128, validation_split=0.2)

In [None]:
import matplotlib.pyplot as plt
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')

plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
input_test = essay_test_data[essay_one_features].values
input_test.shape

In [None]:
# Prediction on test site
essay_pred = pd.DataFrame(model.predict(input_test))
essay_pred.columns = ['project_is_approved']
essay_pred_sample_file = pd.concat([test_ids,essay_pred], axis=1)
essay_pred_sample_file.to_csv('Submissions/bigru_predictions.csv', index=False)
essay_pred_sample_file.tail(10)

### Joining Predicition

In [None]:
train_jdf = pd.concat([top_data, logpred_proba_train], axis=1)
test_jdf 



cnn_data = pd.concat([top_data, train_df_classes], axis=1)
top_data.head(10)



In [None]:
top_data.shape

In [None]:
input_train = top_data.drop(['project_is_approved'], axis=1).values
test_train =
y_train = train_df_classes

In [None]:
model = Sequential()
model.add(Dense(16, activation='relu', input_shape=(4,)))
model.add(Dense(32, activation='relu',))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

In [None]:
history = model.fit(input_train, y_train, epochs=5, batch_size=128, validation_split=0.2)

In [None]:

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')

plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
# Prediction on test site
cnn = pd.DataFrame(model.predict(input_test))
cnn.columns = ['project_is_approved']
cnn_sample_file = pd.concat([test_ids,cnn], axis=1)
cnn_sample_file.to_csv('Submissions/cnn_predictions.csv', index=False)
cnn_sample_file.tail(10)