## **Import librabies**

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import pickle
import os
import re 

sns.set_style("whitegrid")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
from nltk.stem import PorterStemmer
import nltk
# nltk.download('stopwords')
# nltk.download()

from collections import Counter

## **Loading data**

In [37]:
df_train = pd.read_csv('train.csv')


In [38]:
df_train.drop(columns='Unnamed: 0', inplace=True)

In [39]:
df_train['Phrase']

0                                                positively
1         disguise the slack complacency of -LRB- Godard...
2         realizes a fullness that does not negate the s...
3                                                    sealed
4                             Go Where We Went 8 Movies Ago
                                ...                        
117040    It 's a boom-box of a movie that might have be...
117041                                           advantages
117042                                   a conventional way
117043    like Kubrick before him , may not touch the pl...
117044                                         all go wrong
Name: Phrase, Length: 117045, dtype: object

## **Count keywords**

In [46]:
vocab = Counter()

for sentence in df_train['Phrase'][:10]:
    for word in sentence.split(' '):
        vocab['word'] += 1

vocab.most_common()

[('word', 65)]

In [47]:
vocab = Counter()

for sentence in df_train.Phrase:
    for words in sentence.split(' '):
        vocab[words] += 1

vocab.most_common(100)

[('the', 34583),
 (',', 31356),
 ('a', 25066),
 ('of', 24153),
 ('and', 23762),
 ('to', 16686),
 ('.', 13131),
 ("'s", 12630),
 ('in', 10128),
 ('is', 9934),
 ('that', 9071),
 ('it', 7719),
 ('as', 6340),
 ('with', 5711),
 ('for', 5445),
 ('its', 5243),
 ('film', 4953),
 ('an', 4457),
 ('movie', 4353),
 ('this', 3805),
 ('be', 3803),
 ('but', 3724),
 ('on', 3517),
 ('The', 3481),
 ('you', 3440),
 ("n't", 2935),
 ('by', 2874),
 ('more', 2865),
 ('his', 2783),
 ('about', 2692),
 ('from', 2576),
 ('than', 2568),
 ('at', 2566),
 ('--', 2550),
 ('or', 2547),
 ('not', 2533),
 ('one', 2526),
 ('all', 2263),
 ('have', 2260),
 ('are', 2231),
 ('like', 2137),
 ("'", 2124),
 ('has', 2061),
 ('A', 2024),
 ('so', 1898),
 ('story', 1824),
 ('-RRB-', 1820),
 ('out', 1758),
 ('who', 1717),
 ('most', 1638),
 ('into', 1614),
 ('too', 1575),
 ('-LRB-', 1556),
 ('up', 1539),
 ('good', 1488),
 ('their', 1438),
 ('characters', 1411),
 ('...', 1379),
 ('`', 1370),
 ("''", 1360),
 ('``', 1357),
 ('can', 1347)

## **Preprocessing**

#### **Stopwords**

In [15]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

#### **Special characters**

In [16]:
def preprocessor(text):
    """ Return a cleaned version of text
    """
    # Remove HTML markup
    text = re.sub('<[^>]*>', '', text)
    # Save emoticons for later appending
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Remove any non-word character and append the emoticons,
    # Also Convert to lower case
    text = (re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
    
    return text

#### **Stemming/Lemmatize**

In [17]:
from nltk.stem import PorterStemmer
porter = PorterStemmer()

from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()


In [18]:
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [19]:
def tokenizer_lemma(text):
    return [wnl.lemmatize(word, pos='a') for word in text.split()]

## **TRAINING COUNT**

In [20]:
X = df_train['Phrase']
y = df_train['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=693)

In [21]:
count = CountVectorizer(stop_words=stop_words,
                        tokenizer=tokenizer_porter,
                        preprocessor=preprocessor)

In [None]:
clf = Pipeline([('vect', count),
                ('clf', LogisticRegression(random_state=0))])

clf.fit(X_train, y_train)

In [None]:
# Now apply those above metrics to evaluate your model
# Your code here
predictions = clf.predict(X_test)
print('accuracy:',accuracy_score(y_test,predictions))
print('confusion matrix:\n',confusion_matrix(y_test,predictions))
print('classification report:\n',classification_report(y_test,predictions))