In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model, metrics

# Reading Data

In [2]:
def segmentWords(s):
    # Function for tokenizing a string
    # input: s as string
    # output: list of the substrings of s separated by runs of whitespace
    tokens = s.split()
    return tokens

def readFile(fileName):
    # Function for reading file
    # input: filename as string
    # output: contents of file as list containing single words
    #
    # The `with` block closes the handle even if reading raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(fileName) as f:
        contents = f.read()
    # Tokenization is whitespace-based, so reading the file in one go yields
    # the same words as collecting it line by line and re-joining.
    return segmentWords(contents)

#### Create a Dataframe containing the counts of each word in a file

In [3]:
# Sub-directory name -> class label. Entries of data_training that are not
# listed here are skipped: without a label their '__CLASS__' would be NaN and
# a later fillna(0) would silently turn them into negative examples.
CLASS_LABELS = {'pos': 1, 'neg': 0}

# One record (dict) per file: word -> number of occurrences in that file,
# plus the bookkeeping keys '__FileID__' and '__CLASS__'.
d = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the resulting records the same on every run.
for c in sorted(os.listdir("data_training")):
    if c not in CLASS_LABELS:
        continue
    directory = os.path.join("data_training", c)
    for file in sorted(os.listdir(directory)):
        words = readFile(os.path.join(directory, file))
        # Counter tallies all words in a single pass; calling words.count(x)
        # for every word rescanned the whole list each time.
        e = dict(Counter(words))
        e['__FileID__'] = file
        e['__CLASS__'] = CLASS_LABELS[c]
        d.append(e)

In [4]:
# Turn the list of per-file records into a frame: one row per record,
# one column per distinct key. Keys missing from a record become NaN.
df = pd.DataFrame(data=d)
df.describe()

Unnamed: 0,,earth,goodies,if,ripley,suspend,they,white,,,...,zukovsky,zundel,zurg's,zweibel,zwick,zwick's,zwigoff's,zycie,zycie',|
count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,...,1.0,1.0,1.0,1.0,4.0,3.0,2.0,1.0,1.0,2.0
mean,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.666667,6.0,...,1.0,2.0,1.0,1.0,2.25,1.333333,1.0,1.0,1.0,1.0
std,,,,,,,,,0.57735,5.656854,...,,,,,0.957427,0.57735,0.0,,,0.0
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.5,4.0,...,1.0,2.0,1.0,1.0,1.75,1.0,1.0,1.0,1.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,6.0,...,1.0,2.0,1.0,1.0,2.5,1.0,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,8.0,...,1.0,2.0,1.0,1.0,3.0,1.5,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,10.0,...,1.0,2.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0,1.0


Create a dataframe from d - make sure to fill all the nan values with zeros.

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html


In [5]:
# Replace every NaN in the frame with 0, then preview the first rows.
df = df.fillna(value=0)
df.head()

Unnamed: 0,,earth,goodies,if,ripley,suspend,they,white,,,...,zukovsky,zundel,zurg's,zweibel,zwick,zwick's,zwigoff's,zycie,zycie',|
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Split data into training and validation set 

* Sample 80% of your dataframe to be the training data

* Let the remaining 20% be the validation data (you can filter out the indices of the original dataframe that weren't selected for the training data)

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [6]:
# Sample 80% of the rows as the training set.
# The row count comes from len(df): DataFrame has no `rows` attribute, so
# `df.rows` can only resolve to a column that happens to be named "rows",
# which depends on that word occurring somewhere in the corpus.
# random_state pins the draw so the train/validation split is reproducible.
train = df.sample(n=int(len(df) * .8), random_state=42, axis=0)

In [7]:
# .size is the total number of elements (rows * columns), not the row count
train.size

58461440

In [8]:
# Validation set: every row of df whose index label was not drawn into train
validation = df.drop(labels=train.index, axis=0)
validation.size

14615360

* Split the dataframe for both training and validation data into x and y dataframes - where y contains the labels and x contains the words

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [9]:
# Bookkeeping columns that must not be fed to the models as features
meta_cols = ['__FileID__', '__CLASS__']

# Columns are selected / dropped rather than pop()-ed: pop() removes the
# column from train / validation in place, so running this cell a second time
# would raise KeyError, and X_train would be the very same object as train.
X_file = train['__FileID__']
y_train = train['__CLASS__']
X_train = train.drop(meta_cols, axis=1)

X_file_verif = validation['__FileID__']
y_validation = validation['__CLASS__']
X_validation = validation.drop(meta_cols, axis=1)

X_train.head()

Unnamed: 0,,earth,goodies,if,ripley,suspend,they,white,,,...,zukovsky,zundel,zurg's,zweibel,zwick,zwick's,zwigoff's,zycie,zycie',|
742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1552,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Logistic Regression

#### Basic Logistic Regression
* Use sklearn's linear_model.LogisticRegression() to create your model.
* Fit the data and labels with your model.
* Score your model with the same data and labels.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [11]:
# Baseline: logistic regression with every hyper-parameter left at its default
logreg = linear_model.LogisticRegression()
logreg.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# Accuracy on the data the model was fitted on
logreg.score(X_train, y_train)

1.0

In [13]:
# Accuracy on the held-out validation rows
logreg.score(X_validation, y_validation)

0.82187500000000002

#### Changing Parameters

In [14]:
# Second model: smaller C (C is the inverse of regularization strength)
# and a looser stopping tolerance than the defaults
logreg_2 = linear_model.LogisticRegression(tol=0.01, C=1e-2)
logreg_2.fit(X=X_train, y=y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.01,
          verbose=0, warm_start=False)

In [15]:
# Accuracy of the regularized model on its training data
logreg_2.score(X_train, y_train)

0.9921875

In [16]:
# Accuracy of the regularized model on the held-out validation rows
logreg_2.score(X_validation, y_validation)

0.84687500000000004

#### Feature Selection
* In the backward stepwise selection method, you can remove coefficients and the corresponding x columns, where the coefficient is more than a particular amount away from the mean - you can choose how far from the mean is reasonable.

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html#
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.where.html
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.std.html
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.mean.html

How did you select which features to remove? Why did that reduce overfitting?

# Single Decision Tree

#### Basic Decision Tree

* Initialize your model as a decision tree with sklearn.
* Fit the data and labels to the model.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html


In [None]:
from sklearn import tree, metrics, model_selection
# Initialize model
# max_features='log2' limits how many features are examined at each split so
# the fit stays tractable on this very wide matrix. The features considered
# are drawn at random, so random_state is fixed to make the run repeatable.
treeClf = tree.DecisionTreeClassifier(max_features='log2', random_state=42)
# Train model
treeClf.fit(X_train, y_train)
# Accuracy for Training Data
metrics.accuracy_score(y_train, treeClf.predict(X_train))

In [None]:
# Held-out (validation) accuracy of the baseline tree
treeClf.score(X_validation, y_validation)

**NOTICE PERFECT ACCURACY IN TRAINING, BUT GENERALIZES POORLY**

#### Changing Parameters
* To test out which value is optimal for a particular parameter, you can either loop through various values or look into sklearn.model_selection.GridSearchCV

References:


http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [None]:
# Initialize New Instance
# Changes versus the baseline tree: max_depth capped at 10 and entropy used as
# the split criterion; max_features stays at 'log2'.
# random_state fixes the random feature sub-sampling so the run is repeatable.
treeClf_mod = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    max_features='log2',
    random_state=42,
)
# Train model
treeClf_mod.fit(X_train, y_train)
# Accuracy for Training Data
# Scored with treeClf_mod: the earlier version evaluated the baseline treeClf
# here, so the number shown did not belong to the model fitted in this cell.
metrics.accuracy_score(y_train, treeClf_mod.predict(X_train))

In [None]:
# Held-out (validation) accuracy of the depth-limited tree
treeClf_mod.score(X_validation, y_validation)

**NOTICE THAT IT STILL HAS ISSUES GENERALIZING, BUT IS SLIGHTLY BETTER**

How did you choose which parameters to change and what value to give to them? Feel free to show a plot.

**ALEC ANSWER**
* Lowered the max depth to try to avoid the overfitting characteristic of a single decision tree
* Split on entropy to favor more homogeneous groupings

In [None]:
# INPUT PLOT HERE


Why is a single decision tree so prone to overfitting?

**ALEC ANSWER**
* An unconstrained tree keeps splitting until every terminal leaf has zero entropy, so each leaf of the decision tree is a homogeneous group (all the same output value), which means there is no room for allowing errors in the classification of the training data

# Random Forest Classifier

#### Basic Random Forest

* Use sklearn's ensemble.RandomForestClassifier() to create your model.
* Fit the data and labels with your model.
* Score your model with the same data and labels.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html


In [None]:
# Import dependencies
from sklearn.ensemble import RandomForestClassifier

# Initialize Model
# max_features='log2' again keeps the per-split feature count small enough for
# this machine. A random forest is randomized, so random_state is fixed to
# make the scores below repeatable.
forest = RandomForestClassifier(max_features='log2', random_state=42)
# Train model
forest.fit(X_train, y_train)
# Accuracy for Training Data
metrics.accuracy_score(y_train, forest.predict(X_train))

In [None]:
# Held-out (validation) accuracy of the baseline forest
forest.score(X_validation, y_validation)

**TRAINING ACCURACY WENT DOWN, BUT VALIDATION WENT UP, LIKELY DUE TO OVERFITTING BEING CORRECTED SLIGHTLY**

#### Changing Parameters

In [None]:
# Initialize Model
# 250 trees, entropy as the split criterion, 'log2' features per split.
# random_state is fixed so the randomized forest gives the same scores on
# every run.
forest_mod = RandomForestClassifier(
    n_estimators=250,
    criterion='entropy',
    max_features='log2',
    random_state=42,
)
# Train model
forest_mod.fit(X_train, y_train)
# Accuracy for Training Data
metrics.accuracy_score(y_train, forest_mod.predict(X_train))

In [None]:
# Held-out (validation) accuracy of the 250-tree forest
forest_mod.score(X_validation, y_validation)

What parameters did you choose to change and why?

**ALEC ANSWERS**
* Increased the number of estimators (more trees) to get better generalization
* Log2 features - computer cannot handle more
* Entropy as the split criterion, to value information gain

How does a random forest classifier prevent overfitting better than a single decision tree?

**ALEC ANSWER**
* Each tree overfits in a way that can be modelled as random, so an aggregation of many trees, each overfit in a different way, averages out to a more accurate model that generalizes better
* i.e. each individual tree's overfitting is "smoothed out" in the ensemble