In [None]:
# Same stuff from last time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pylab
pylab.rcParams['figure.figsize'] = (10, 10)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the raw responses from the CSV file in the working directory
climate_essay_df = pd.read_csv(filepath_or_buffer='data_noID.csv')

# Show (n_rows, n_columns) of the loaded frame
climate_essay_df.shape

In [None]:
# Keep only the rows where trt1 == 1, restricted to the three columns used
# below, and renumber the rows from zero.
subsetted_df = (
    climate_essay_df
    .loc[climate_essay_df['trt1'] == 1, ['trt1', 'don', 'essay']]
    .reset_index(drop=True)
)
subsetted_df.shape

In [None]:
# Binary label: 1 when the 'don' value is strictly above 8.5, otherwise 0
subsetted_df['big_donator'] = subsetted_df['don'].gt(8.5).astype(int)
subsetted_df.head()

In [None]:
# Hold out 20% of the rows as the test (out-of-sample) set and keep the
# remaining 80% for training (in-sample). The fixed random_state makes
# the split identical on every run.
train_df, test_df = train_test_split(
    subsetted_df,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Pull the essay text out of each split as a plain array of strings
train_essays, test_essays = (split['essay'].values for split in (train_df, test_df))

# Build the word counter and learn its vocabulary from the training essays only
word_counter = CountVectorizer().fit(train_essays)

# Encode both splits as word-count matrices using that vocabulary
train_word_counts = word_counter.transform(train_essays)
test_word_counts = word_counter.transform(test_essays)

# Dimensionality Reduction and SVM

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

tSNE is similar to PCA in that it reduces the dimensionality of your data; however, the two methods differ in important ways. Notably, PCA outputs a function that can transform new data, while tSNE does not. Additionally, they reduce dimensions in different ways: PCA reduces in a way that maintains the variability of the original data, while tSNE reduces in a way that maintains distances between points. Finally, tSNE is a nonlinear dimensionality reduction, whereas PCA is linear.

In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Target labels (the big_donator column) for each split, as plain arrays
train_Y, test_Y = (split['big_donator'].values for split in (train_df, test_df))

In [None]:
def train_model(model_type, new_dim, kernel, C_value, min_split, max_depth, n_estimators):
    """Fit one classifier on the essay word counts and print train/test metrics.

    Reads the notebook-level variables ``train_word_counts``,
    ``test_word_counts``, ``train_Y`` and ``test_Y``.

    Parameters
    ----------
    model_type : str
        One of 'svm', 'decision_tree', 'random_forest', 'logistic_regression'.
    new_dim : int
        Number of PCA components to reduce the word counts to. Values of 999
        or more skip PCA and feed the sparse word counts to the model directly.
    kernel, C_value
        Passed to ``SVC``; ignored by the other model types.
    min_split, max_depth
        Passed to the decision tree and random forest as ``min_samples_split``
        and ``max_depth``; ignored by the other model types.
    n_estimators
        Passed to the random forest; ignored by the other model types.

    Returns
    -------
    The fitted scikit-learn estimator.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.
    """
    # Build the (unfitted) estimator first so an unknown model_type fails
    # immediately with a clear error instead of an unbound `model` later on.
    if model_type == 'svm':
        model = SVC(kernel = kernel, C = C_value, random_state=123)
    elif model_type == 'decision_tree':
        model = DecisionTreeClassifier(min_samples_split = min_split,
                                       max_depth = max_depth, random_state = 123)
    elif model_type == 'random_forest':
        model = RandomForestClassifier(n_estimators = n_estimators, max_depth = max_depth,
                                       min_samples_split = min_split, random_state = 123)
    elif model_type == 'logistic_regression':
        model = LogisticRegression()
    else:
        raise ValueError("Invalid model type. Valid models include: 'svm', 'decision_tree', "
                         "'random_forest', 'logistic_regression'")

    # Set up: optionally reduce dimensionality with PCA
    if new_dim < 999:
        # .toarray() gives plain ndarrays; .todense() would give np.matrix,
        # which current scikit-learn estimators refuse to accept.
        train_dense = train_word_counts.toarray()
        test_dense = test_word_counts.toarray()
        dim_reduction = PCA(n_components = new_dim, random_state=123)
        dim_reduction.fit(train_dense)
        train_reduced = dim_reduction.transform(train_dense)
        test_reduced = dim_reduction.transform(test_dense)
    else:
        train_reduced = train_word_counts
        test_reduced = test_word_counts

    # Fitting model
    model.fit(X = train_reduced,
              y = train_Y)

    # Decision-boundary plot - only meaningful (and only tested) for an SVM
    # on exactly two PCA components
    if model_type == 'svm' and new_dim == 2:
        h = 0.2  # grid step used to sample the plane
        x_min, x_max = train_reduced[:, 0].min() - 1, train_reduced[:, 0].max() + 1
        y_min, y_max = train_reduced[:, 1].min() - 1, train_reduced[:, 1].max() + 1
        xx, yy = np.meshgrid(
            np.arange(x_min, x_max, h),
            np.arange(y_min, y_max, h))
        # Predict a class for every grid point, then shade regions by class
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.contourf(xx, yy, Z, cmap=plt.cm.viridis, alpha=0.8)
        plt.scatter(train_reduced[:, 0], train_reduced[:, 1], c=train_Y, cmap = plt.cm.viridis)
        plt.show()

    # Evaluation. Metrics are called as (y_true, y_pred) so the confusion
    # matrix follows the scikit-learn layout: rows = true, columns = predicted.
    print("TRAIN SET")
    train_predictions = model.predict(train_reduced)
    train_accuracy = accuracy_score(train_Y, train_predictions)
    print('The training set accuracy is %0.6f' % train_accuracy)
    confusion_M = confusion_matrix(train_Y, train_predictions)
    print(confusion_M)
    print("\nTEST SET")
    test_predictions = model.predict(test_reduced)
    test_accuracy = accuracy_score(test_Y, test_predictions)
    print('The test set accuracy is %0.6f' % test_accuracy)
    confusion_M = confusion_matrix(test_Y, test_predictions)
    print(confusion_M)

    return model

<br><br><br><br>

In [None]:
# Linear-kernel SVM on 2 PCA components with C = 10; the tree/forest
# arguments are unused placeholders for this model type.
svm = train_model(model_type='svm', new_dim=2, kernel='linear', C_value=1e1,
                  min_split='whatever', max_depth='whatever', n_estimators='whatever')

In [None]:
# Same linear SVM on 2 PCA components, but with a much smaller C (1e-4)
svm = train_model(model_type='svm', new_dim=2, kernel='linear', C_value=1e-4,
                  min_split='whatever', max_depth='whatever', n_estimators='whatever')

In [None]:
# RBF-kernel SVM on 2 PCA components with C = 10
svm = train_model(model_type='svm', new_dim=2, kernel='rbf', C_value=1e1,
                  min_split='whatever', max_depth='whatever', n_estimators='whatever')

<br><br><br>

In [None]:
# Decision tree on 2 PCA components: min_samples_split = 20, max_depth = 100.
# The SVM and forest-only arguments are unused placeholders here.
decision_tree = train_model(model_type='decision_tree', new_dim=2,
                            kernel='whatever', C_value='whatever',
                            min_split=20, max_depth=100, n_estimators='whatever')

In [None]:
# Decision tree on 2 PCA components: min_samples_split = 200, max_depth = 3
decision_tree = train_model(model_type='decision_tree', new_dim=2,
                            kernel='whatever', C_value='whatever',
                            min_split=200, max_depth=3, n_estimators='whatever')

In [None]:
# Render the fitted decision tree as a PNG via Graphviz.
# StringIO comes from the standard library: the sklearn.externals.six shim
# is no longer shipped with scikit-learn.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# export_graphviz writes the tree in DOT format into the in-memory buffer
dot_data = StringIO()
export_graphviz(decision_tree, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

<br><br><br>

In [None]:
# Random forest on 2 PCA components: 1000 trees, min_samples_split = 20,
# max_depth = 100. The SVM-only arguments are unused placeholders here.
random_forest = train_model(model_type='random_forest', new_dim=2,
                            kernel='whatever', C_value='whatever',
                            min_split=20, max_depth=100, n_estimators=1000)

In [None]:
# Random forest on 2 PCA components with a single tree:
# min_samples_split = 200, max_depth = 3
random_forest = train_model(model_type='random_forest', new_dim=2,
                            kernel='whatever', C_value='whatever',
                            min_split=200, max_depth=3, n_estimators=1)

<br><br><br>

# Logistic Regression Revisited

## Parameter Exploration

In [None]:
# Logistic regression on the raw word counts: new_dim = 1000 is above the
# PCA cutoff inside train_model, so no dimensionality reduction is applied.
# All remaining arguments are unused placeholders for this model type.
logistic_regression = train_model(model_type='logistic_regression', new_dim=1000,
                                  kernel='whatever', C_value='whatever',
                                  min_split='whatever', max_depth='whatever',
                                  n_estimators='whatever')

In [None]:
# Vocabulary words in column order. get_feature_names() was removed from
# newer scikit-learn releases in favour of get_feature_names_out(), so use
# the new name when it exists and fall back for older installations.
if hasattr(word_counter, 'get_feature_names_out'):
    feature_names = np.asarray(word_counter.get_feature_names_out())
else:
    feature_names = np.asarray(word_counter.get_feature_names())

# Print every word with its coefficient, from most negative to most positive
coefficients = logistic_regression.coef_[0]
args = np.argsort(coefficients)
for a in args:
    print(" %s: %0.4f" % (feature_names[a], coefficients[a]))

## More Manual Implementation

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [None]:
class PyTorchLogRegress(torch.nn.Module):
    """Logistic regression: one linear layer followed by a sigmoid.

    Parameters
    ----------
    n_features : int, default 5
        Number of input features per sample (the width of the linear layer).
    """

    def __init__(self, n_features=5):
        super().__init__()
        self.linear_layer = torch.nn.Linear(n_features, 1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for ``x``.

        ``x`` must have ``n_features`` values along its last axis; the
        output has a single value along that axis.
        """
        linear_output = self.linear_layer(x)
        # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid
        logistic_output = torch.sigmoid(linear_output)
        return logistic_output


# Make it
classifier = PyTorchLogRegress()

In [None]:
# Reduce the word counts to 5 principal components, matching the 5 inputs
# of the classifier's linear layer. PCA is fitted on the training set only.
# .toarray() gives plain ndarrays (.todense() would give np.matrix, which
# current scikit-learn rejects), and each split is densified only once.
train_dense = train_word_counts.toarray()
test_dense = test_word_counts.toarray()
dim_reduction = PCA(n_components = 5, random_state=123)
dim_reduction.fit(train_dense)
train_reduced = dim_reduction.transform(train_dense)
test_reduced = dim_reduction.transform(test_dense)

In [None]:
# Binary cross-entropy between predicted probabilities and 0/1 labels,
# minimised by Adam with its default settings
loss_function = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=classifier.parameters())

In [None]:
# Convert the NumPy features and labels to float32 tensors. Plain tensors
# are used directly: the torch.autograd.Variable wrapper is deprecated and
# no longer needed. Labels get a trailing axis of size 1 so they line up
# with the classifier's single-column output.
train_torch_X = torch.from_numpy(train_reduced).float()
train_torch_Y = torch.from_numpy(train_Y).float().unsqueeze(dim = 1)

test_torch_X = torch.from_numpy(test_reduced).float()
test_torch_Y = torch.from_numpy(test_Y).float().unsqueeze(dim = 1)

In [None]:
# Per-iteration loss history for the train and test sets
loss_list = []
test_loss_list = []

for i in range(1000):
    # Clear gradients accumulated by the previous iteration
    optimizer.zero_grad()

    predictions = classifier(train_torch_X)
    loss = loss_function(predictions, train_torch_Y)

    # The test loss is only monitored, never optimised, so skip gradient
    # tracking for this forward pass.
    with torch.no_grad():
        test_predictions = classifier(test_torch_X)
        test_loss = loss_function(test_predictions, test_torch_Y)

    # .item() extracts the Python float from a 0-dim loss tensor; indexing
    # with .data[0] fails on current PyTorch.
    loss_list.append(loss.item())
    test_loss_list.append(test_loss.item())

    loss.backward()
    optimizer.step()

In [None]:
# Train vs. test loss over the training iterations
plt.plot(loss_list, label = 'train')
plt.plot(test_loss_list, label = 'test')
plt.xlabel('Training iteration')
plt.ylabel('Binary cross-entropy loss')
plt.legend();

In [None]:
# Predicted probabilities for the test set; no gradients are needed here
with torch.no_grad():
    test_predictions = classifier(test_torch_X)

# Threshold in one tensor comparison instead of a per-row Python loop:
# a probability below 0.5 becomes class 0, anything else becomes class 1.
below_half = test_predictions[:, 0] < 0.5
plain_prediction_list = [0 if is_below else 1 for is_below in below_half.tolist()]

In [None]:
# Fraction of test essays whose thresholded prediction matches the label
accuracy_score(y_true=test_Y, y_pred=plain_prediction_list)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are the true labels and
# columns the predicted labels. Passing them the other way round
# transposes the matrix.
confusion_matrix(y_true=test_Y, y_pred=plain_prediction_list)