# TEMPLATE FOR CLASSIFICATION PROBLEMS

## INPUTS

In [89]:
# The input file holds both the already-classified rows (used for training) and
# the rows that still have to be classified.
#  - It must be in CSV format, with the column headings in the first row.
#  - It must be in the same directory as this IPython notebook.
#  - The name must be in single or double quotes [ex: input_file = "random_filename.csv"].
#  - Entries with a null value in the text column are meant to be allowed.
#    NOTE(review): confirm that the data-loading cell tolerates null text values.
input_file = "random_filename.csv"

# The column containing the text field [ex: text_column = "random_text_column_name"]:
text_column = "random_text_column_name"

# The column containing the binary indicator of which class each row falls into
# [ex: binary_classifier_column = "binary_classifier_name"].
# It is empty for the rows which have not been classified yet.
binary_classifier_column = "binary_classifier_column_name"

# The CSV file that the final prediction cell writes its results to.
output_file = "predictions.csv"

In [92]:
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
import re,csv

import nltk.tokenize as tokenize

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import cross_validation, grid_search
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn import linear_model, decomposition
from sklearn import metrics, cross_validation, ensemble, svm, linear_model, naive_bayes
import nltk

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords, words

import matplotlib
# Apply matplotlib's 'ggplot' style sheet to any plots drawn after this point.
matplotlib.style.use('ggplot')

## READING IN THE DATA

In [50]:
# Read the whole CSV (classified and unclassified rows together) into a dataframe.
df = pd.read_csv(input_file, encoding='latin-1')

# pandas returns a null text entry as float NaN, which has no .encode() method.
# Replace nulls with an empty string so such rows become an all-zero feature
# row instead of raising AttributeError. Every text value is then stored as
# UTF-8 encoded bytes, as before.
df[text_column] = [
    (u'' if pd.isnull(raw_text) else raw_text).encode('utf-8')
    for raw_text in df[text_column]
]

Xtextcol = df[text_column]  # the text of every row, classified or not
Y = df[binary_classifier_column]  # class label; null for rows still to be classified

## get the indices of the rows that are to be used to train the model, and the rows that need to be predicted
# Only the label column decides which group a row belongs to: a null in some
# other column must not move an already-classified row into the "to predict"
# group. The indices are later used as row positions in the feature matrix,
# which works because read_csv assigns a default 0..n-1 index.
label_is_missing = Y.isnull()
index_of_nulls = df[label_is_missing].index.tolist()
index_of_trained = df[~label_is_missing].index.tolist()

There are two inputs to the model, as described below:

(1) Xtextcol => an array containing the text fields, each row a separate text entry

(2) Y => a binary indicator of which class each row falls into

## SEPARATING DATA INTO TESTING & TRAINING SET

In [55]:
SPLIT_SEED = 42  # fixed seed so the train/test split is identical on every run

# Tokenize the text into unigram and bigram counts, dropping English stop words.
# NOTE: the vocabulary and the idf weights are fitted on every row, including
# the rows that are later held out as the test set.
cvect = CountVectorizer(ngram_range=(1, 2), stop_words=nltk.corpus.stopwords.words('english'))
Xvect = cvect.fit_transform(Xtextcol)
Xvect_names = cvect.get_feature_names()
tfidf = TfidfTransformer()
Xtfidf = tfidf.fit_transform(Xvect)  # tf-idf transformation

Xscore = Xtfidf[index_of_trained, :]  # training and testing dataset [i.e. made up of classified rows]
Xpred = Xtfidf[index_of_nulls, :]  # rows that still need a prediction

# Select the label rows with the same indices that were used for Xscore, so the
# features and the labels can never get out of step with each other.
dfY = df.iloc[index_of_trained]
assert Xscore.shape[0] == len(dfY), "feature rows and label rows do not line up"
assert not dfY[binary_classifier_column].isnull().any(), "training rows must all have a class label"

x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    Xscore, dfY[binary_classifier_column], test_size=0.2, random_state=SPLIT_SEED)
print(np.shape(dfY))
print(np.shape(Xscore))

(447, 3)
(447, 9685)


## BUILDING PIPELINE, TESTING DIFFERENT MODELS

In [110]:
MODEL_SEED = 42  # fixed seed so the stochastic estimators give the same result on every run

# Candidate classifiers, keyed by the name used in the printed report and in
# the y_name / y_test_predict dictionaries.
ESTIMATORS = {
    "svm": svm.LinearSVC(C=1.0, random_state=MODEL_SEED),
    "ridge": linear_model.RidgeClassifierCV(alphas=(0.1, 0.5, 1.0, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 10.0),normalize=True),
    "SGD": linear_model.SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet", random_state=MODEL_SEED),
    "perceptron": linear_model.Perceptron(n_iter=50, random_state=MODEL_SEED),
    "passive_aggressive": linear_model.PassiveAggressiveClassifier(n_iter=50, random_state=MODEL_SEED),
    "random forest": ensemble.RandomForestClassifier(random_state=MODEL_SEED),
}

y_test_predict = dict()  # name -> predictions on the held-out test set
y_name = dict()  # name -> fitted estimator (used again by the prediction cell)

# Every print below takes one pre-formatted string in parentheses, which prints
# the same text whether print is a statement or a function.
print("We're going to test the above algorithms (svm, ridge, SGD, perceptron, passive agrressive, random forest) to see which one yields the best predictions. \n\n")
for name, estimator in ESTIMATORS.items():
    # Fit on the training split, then evaluate on the held-out test split.
    estimator.fit(x_train, y_train)
    y_name[name] = estimator
    test_predictions = estimator.predict(x_test)
    y_test_predict[name] = test_predictions

    print("ALGORITHM NAME: %s \n" % name)
    print("%s score: %.2f" % (name, estimator.score(x_test, y_test)))
    print("%s confusion matrix:" % name)
    print(metrics.confusion_matrix(y_test, test_predictions))
    print("%s classification report:" % name)
    print(metrics.classification_report(y_test, test_predictions))
    print("%s Matthew's correlation coefficient: %.2f \n \n" % (name, metrics.matthews_corrcoef(y_test, test_predictions)))

We're going to test the above algorithms (svm, ridge, SGD, perceptron, passive agrressive, random forest) to see which one yields the best predictions. 


ALGORITHM NAME: svm 

svm score: 0.76
svm confusion matrix:
[[40  3]
 [19 28]]
svm classification report:
             precision    recall  f1-score   support

        1.0       0.68      0.93      0.78        43
        4.0       0.90      0.60      0.72        47

avg / total       0.80      0.76      0.75        90

svm Matthew's correlation coefficient: 0.55 
 

ALGORITHM NAME: ridge 

ridge score: 0.73
ridge confusion matrix:
[[42  1]
 [23 24]]
ridge classification report:
             precision    recall  f1-score   support

        1.0       0.65      0.98      0.78        43
        4.0       0.96      0.51      0.67        47

avg / total       0.81      0.73      0.72        90

ridge Matthew's correlation coefficient: 0.54 
 

ALGORITHM NAME: passive_aggressive 

passive_aggressive score: 0.73
passive_aggressive confusion 

## CHOOSING A MODEL

Choose the model with the best score, or whichever other metric seems best.

For example, to use the perceptron algorithm (score: .81, second best in this run) to compute the predicted class and a probability for each row that has not been classified yet, run the commands below:

In [94]:
# The fitted model used for the final predictions. It must provide
# decision_function().
chosen_model = y_name['perceptron']

#to calculate which class they fall into:
class_of_predicted = chosen_model.predict(Xpred)

#our confidence in our class predictions (computed once and reused below):
scores_of_predicted = chosen_model.decision_function(Xpred)

#to calculate the probability [0-.5 means 1; .5 to 1 means 4]:
# The score is squashed into (0, 1) with the logistic function. The form
# 1 / (1 + exp(-d)) is used because exp(d) / (1 + exp(d)) evaluates to inf / inf
# (NaN) for large positive scores. An overflow of exp(-d) for very negative
# scores correctly yields 0.0, so that warning is silenced.
# NOTE(review): this is a monotone rescaling of the score, not a calibrated probability.
with np.errstate(over='ignore'):
    probs_of_predicted = 1.0 / (1.0 + np.exp(-scores_of_predicted))

# Text of the rows being predicted, looked up once instead of once per row.
texts_of_predicted = df[text_column].iloc[index_of_nulls]

# 'wb' is the mode the Python 2 csv module expects.
# NOTE(review): on Python 3 this would need mode 'w' with newline=''.
with open(output_file, 'wb') as csvfile:
    output_text = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    output_text.writerow(['row_index','text','predicted class','confidence in predicted class','probability'])
    for row_index, text, predicted_class, score, prob in zip(
            index_of_nulls, texts_of_predicted, class_of_predicted,
            scores_of_predicted, probs_of_predicted):
        output_text.writerow([row_index, text.strip(), predicted_class, score, prob])


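To use a different model, e.g. ridge, replace y_name['perceptron'] with y_name['ridge']. The chosen model must provide a `decision_function` method; the random forest does not, so it cannot be used with this cell as written.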