In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('./word2vec-nlp-tutorial/labeledTrainData.tsv', header=0,
                   delimiter='\t', quoting=3)

In [3]:
split_index = train_test_split(data.index, test_size=0.20)
train = data.iloc[split_index[0],:]
validation = data.iloc[split_index[1],:]

# Data cleaning

In [4]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [5]:
def remove_html(raw_text):
    return BeautifulSoup(raw_text).get_text()


def remove_non_english_letters(text):
    return re.sub('[^A-Za-z]', ' ', text)


def convert_to_lower_case(text):
    return text.lower()


def split_text_into_individual_words(text):
    return text.split()


def remove_stopwords(words):
    def set_of_stopwords():
        return set(stopwords.words('english'))
    
    english_stopwords = set_of_stopwords()
    return [w for w in words if not w in english_stopwords]


def words_to_one_string(words):
    return ' '.join(words)


def clean_and_process_text(raw_text):
    text = remove_html(raw_text)
    only_letters = remove_non_english_letters(text)
    lower_case = convert_to_lower_case(only_letters)
    words = split_text_into_individual_words(lower_case)
    important_words = remove_stopwords(words)
    return words_to_one_string(important_words)

In [6]:
train['clean_review'] = train.review.apply(clean_and_process_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# Create Features

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
vectorizer = CountVectorizer(analyzer='word',
                            tokenizer=None,
                            preprocessor=None,
                            stop_words=None,
                            max_features=5000)
train_features = vectorizer.fit_transform(train['clean_review'])
train_features = pd.DataFrame(train_features.toarray())
train_features.columns = vectorizer.get_feature_names()

In [9]:
validation['clean_review'] = validation.review.apply(clean_and_process_text)
validation_features = vectorizer.transform(validation['clean_review'])
validation_features = pd.DataFrame(validation_features.toarray())
validation_features.columns = vectorizer.get_feature_names()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [10]:
train_features.head()

Unnamed: 0,abandoned,abc,abilities,ability,able,abraham,absence,absent,absolute,absolutely,...,yet,york,young,younger,youth,zero,zizek,zombie,zombies,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,4,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
validation_features.head()

Unnamed: 0,abandoned,abc,abilities,ability,able,abraham,absence,absent,absolute,absolutely,...,yet,york,young,younger,youth,zero,zizek,zombie,zombies,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# PCA

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [17]:
scaler =  StandardScaler()
scaler = scaler.fit(train_features)
scaled_train_x = scaler.transform(train_features)
scaled_validation_x = scaler.transform(validation_features)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [19]:
scaled_train_x.shape

(20000, 5000)

In [20]:
pca = PCA(n_components=100)
principalComponents = pca.fit(scaled_train_x)

In [21]:
pca_train = principalComponents.transform(scaled_train_x)
pca_validation = principalComponents.transform(scaled_validation_x)

In [25]:
print(pca_train.shape)
print(pca_validation.shape)

(20000, 100)
(5000, 100)


# Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, confusion_matrix

In [29]:
forest = RandomForestClassifier(n_estimators=100)
forest_model = forest.fit(pca_train, train['sentiment'])

In [31]:
result = forest_model.predict(pca_validation)
print(r2_score(validation.sentiment, result))
print(confusion_matrix(validation.sentiment, result))

0.3511542174415826
[[2080  441]
 [ 370 2109]]
