# Modern Methods for Text Classification

In [4]:
from pathlib import Path
import pandas as pd
import gzip
from urllib.request import urlretrieve
from tqdm import tqdm
import os
import numpy as np

In [5]:
class TqdmUpto(tqdm):
    """Progress bar with an ``update_to`` hook.

    ``update_to`` takes ``(blocks_so_far, block_size, total_size)``, which is
    the argument order ``urlretrieve`` uses for its ``reporthook`` callback.
    """

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to the absolute position ``b * bsize``.

        Parameters
        ----------
        b : int
            Number of blocks transferred so far.
        bsize : int
            Size of each block, in the bar's unit.
        tsize : int or None
            Total size. When given, it replaces ``self.total``.
        """
        if tsize is not None:
            self.total = tsize
        # `update` takes an increment, so subtract what is already displayed.
        # This has to run on every call: previously it was nested under the
        # `tsize` check, so the bar never advanced when no total was supplied.
        self.update(b * bsize - self.n)

In [6]:
def get_data(url, filename):
    """Download ``url`` to ``filename`` unless ``filename`` already exists.

    Missing parent directories of ``filename`` are created. While the
    transfer runs, a progress bar labelled with the last path component of
    ``url`` is shown.

    Parameters
    ----------
    url : str
        Location of the resource to fetch.
    filename : str or path-like
        Destination path on disk.

    Raises
    ------
    Whatever ``urlretrieve`` raises; the partially written file is removed
    before the exception propagates.
    """
    if os.path.exists(filename):
        return

    # A bare file name has an empty dirname and os.makedirs('') raises,
    # so only create the parent directory when there is one.
    dirname = os.path.dirname(filename)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    # Download under a temporary name and rename on success. Otherwise an
    # interrupted transfer would leave a truncated file that the existence
    # check above would treat as a finished download on the next call.
    partial = os.fspath(filename) + '.part'
    try:
        with TqdmUpto(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
            urlretrieve(url, partial, reporthook=t.update_to)
        os.replace(partial, filename)
    except BaseException:
        if os.path.exists(partial):
            os.remove(partial)
        raise

In [7]:
# data_url = 'http://files.fast.ai/data/aclImdb.tgz'
# get_data(data_url, 'data/imdb.tgz')

In [8]:
# The extracted archive is expected under ./data/aclImdb, relative to the
# current working directory; stop early if it is missing.
data_path = Path.cwd() / 'data' / 'aclImdb'
assert data_path.exists()

In [9]:
# Print the name of every sub-directory found anywhere below data_path, to
# confirm the files were extracted at the expected location.
for _dirpath, subdir_names, _filenames in os.walk(data_path):
    for subdir_name in subdir_names:
        print(subdir_name)

test
train
neg
all
pos
neg
unsup
all
pos


In [10]:
# Directories holding the training and the test split.
train_path, test_path = data_path / 'train', data_path / 'test'

In [11]:
def read_data(dir_path, random_state=None):
    """Load the labelled reviews under ``dir_path`` into a shuffled DataFrame.

    Every file directly inside ``dir_path/'pos'`` becomes a row with
    ``label == 1`` and every file inside ``dir_path/'neg'`` a row with
    ``label == 0``.

    Parameters
    ----------
    dir_path : pathlib.Path
        Directory containing the ``pos`` and ``neg`` sub-directories.
    random_state : int, numpy Generator/RandomState or None, optional
        Passed to ``DataFrame.sample`` to make the shuffle reproducible.
        The default (``None``) keeps the previous, unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``label``, rows in random order, with a fresh
        0..n-1 index.
    """

    def load_dir_reviews(reviews_path):
        """Read every file in ``reviews_path`` as UTF-8 text."""
        # `read_text` opens and closes each file, so no handles are leaked
        # (the previous `open(...)` calls were never closed). Sorting makes
        # the pre-shuffle order independent of the filesystem, so a fixed
        # `random_state` yields the same frame on every run.
        reviews = [
            review_file.read_text(encoding='utf-8')
            for review_file in sorted(reviews_path.iterdir())
        ]
        return pd.DataFrame({'text': reviews})

    pos_reviews = load_dir_reviews(dir_path / 'pos').assign(label=1)
    neg_reviews = load_dir_reviews(dir_path / 'neg').assign(label=0)

    merged = pd.concat([pos_reviews, neg_reviews])
    shuffled = merged.sample(frac=1.0, random_state=random_state)  # shuffle the rows
    # drop=True discards the pre-shuffle index instead of keeping it as a column.
    return shuffled.reset_index(drop=True)

In [12]:
# NOTE(review): repeats the assignments from the cell executed as In [10];
# harmless on re-run, but one of the two cells is redundant.
train_path = data_path/'train'
test_path = data_path/'test'

In [13]:
# One shuffled, labelled DataFrame per split.
train, test = read_data(train_path), read_data(test_path)

In [14]:
train.head()

Unnamed: 0,text,label
0,"""Bend It Like Beckham"" reminds me of the best ...",1
1,Nay Sayers of this film are likely bitter from...,1
2,"A long time ago, in a galaxy far, far away.......",1
3,This movie explores the difficulties that stra...,1
4,The plot doesn't begin to describe the film: a...,1


In [15]:
# Review text is the model input, the 0/1 label is the target.
X_train, X_test = train['text'], test['text']
y_train, y_test = train['label'], test['label']

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

## Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression as LR

In [18]:
# Baseline: token counts -> TF-IDF weighting -> logistic regression.
lr_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LR()),
])

In [19]:
# Fit every pipeline step on the training split.
lr_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression())])

In [20]:
# Predict a label for every review in the test split.
lr_predicted = lr_clf.predict(X_test)

In [21]:
# Accuracy = share of test reviews whose predicted label equals the true one.
n_correct = sum(lr_predicted == y_test)
lr_acc = n_correct / len(lr_predicted)
lr_acc

0.88316

In [22]:
def imdb_acc(pipeline_clf, X=None, y=None):
    """Return the accuracy of ``pipeline_clf`` together with its predictions.

    Parameters
    ----------
    pipeline_clf : object with a ``predict`` method
        A fitted classifier or pipeline.
    X : sequence of inputs, optional
        Samples to predict on. Defaults to the notebook-level ``X_test``.
    y : sequence of labels, optional
        True labels for ``X``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    (float, array-like)
        Fraction of predictions equal to the true label, and the raw
        predictions as returned by ``pipeline_clf.predict``.
    """
    # Fall back to the notebook globals so existing `imdb_acc(clf)` calls
    # keep working, while allowing evaluation on any other split.
    features = X_test if X is None else X
    labels = y_test if y is None else y

    predictions = pipeline_clf.predict(features)
    assert len(labels) == len(predictions), (
        'got %d predictions for %d labels' % (len(predictions), len(labels))
    )

    # Compare position by position on plain arrays: vectorised, and not
    # affected by whatever index `labels` carries.
    matches = np.asarray(predictions) == np.asarray(labels)
    return float(np.mean(matches)), predictions

In [23]:
# Same pipeline as before, but the vectorizer drops English stop words.
lr_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', LR()),
])

In [24]:
# Refit on the training split with the stop-word-filtering vectorizer.
lr_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()), ('clf', LogisticRegression())])

In [25]:
# Accuracy and raw predictions of the stop-word-filtered model on the test split.
lr_acc, lr_predictions = imdb_acc(lr_clf)

In [26]:
# Display the accuracy computed in the previous cell.
lr_acc

0.879

In [27]:
# Widen the vectorizer from single words to 1-, 2- and 3-grams
# (English stop words are still removed), then refit and re-evaluate.
lr_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', LR()),
])
lr_clf.fit(X_train, y_train)
lr_acc, lr_prediction = imdb_acc(lr_clf)
lr_acc

0.86596

## Multinomial Naive Bayes

In [28]:
from sklearn.naive_bayes import MultinomialNB as MNB

In [29]:
# Multinomial naive Bayes directly on raw token counts (no TF-IDF step).
mnb_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MNB()),
])

In [30]:
# Fit on the training split, then score on the test split.
mnb_clf.fit(X_train, y_train)
mnb_acc, mnb_prediction = imdb_acc(mnb_clf)
mnb_acc

0.81356

In [31]:
# Same naive Bayes model, now with a TF-IDF step after the token counts.
mnb_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MNB()),
])
mnb_clf.fit(X=X_train, y=y_train)
mnb_acc, mnb_prediction = imdb_acc(mnb_clf)
mnb_acc

0.82956

## Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier