In [1]:
import pandas as pd

train = pd.read_csv('..\\0.data\\raw\\imdb_train.csv')
print(len(train))
print(train.head())

test = pd.read_csv('..\\0.data\\raw\\imdb_test.csv')
print(len(test))
print(test.head())

17500
      id  labels                                               text
0   1288       0  We saw this on the shelf at the local video st...
1   2064       0  Well, you'd better if you plan on sitting thro...
2  18997       1  This is my favorite Jackie Chan movie and in a...
3  10448       0  The long list of "big" names in this flick (in...
4  16133       1  The great and underrated Marion Davies shows h...
7500
      id  labels                                               text
0  20594       1  I am decidedly not in the target audience for ...
1    602       0  Detective Russell Logan(Lou Diamond Phillips)h...
2     29       0  I had some expectation for the movie, since it...
3  20342       1  I think that this movie is very neat. You eith...
4   6230       0  Well I just gave away 95 minutes and 47 second...


In [7]:
from nltk.tokenize import RegexpTokenizer
from nltk import stem

def clean_paragraph(para):
    lmtzr = stem.WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    
    p = ' '.join([lmtzr.lemmatize(token.lower()) for token in tokenizer.tokenize(para)])
    
    return p

In [5]:
import time; t0 = time.time()

train.text = [clean_paragraph(para) for para in train.text]

print(time.time() - t0)

43.145670652389526


In [6]:
train.text[0]

'we saw this on the shelf at the local video store saw coppola in the credit and got excited that wa the one and only time this movie raised any interest i could never quite work out if it wa an attempt at a humourous film that failed miserably or an attempt at a serious film that failed miserably in general the entire production seemed incredibly amatuerish the sound in particular wa absolutely dreadful especially in the scene shot in the little bar the dialogue wa so corny in part it wa unbelievable very disappointing'

## Create Classes and Assign Probabilities

In [7]:
'''Find probabilites of each class'''

print(train.labels.value_counts())

prob_of_each_class = []
totalDocCount = len(train)

for a in train.labels.value_counts():
    prob_of_each_class.append(float(a/totalDocCount))
    
prob_of_each_class = pd.Series(prob_of_each_class)
print(prob_of_each_class); prob_of_each_class.sum()

0    8764
1    8736
Name: labels, dtype: int64
0    0.5008
1    0.4992
dtype: float64


1.0

## Create Word Matrix

In [6]:
'''Get Unique Words from the text data'''

tokenDict = {}
i = 0 
for row in train.text:
    for token in row.split(' '):
        if tokenDict.get(token) == None:
            tokenDict[token] = i
            i = i + 1
        else:
            continue

len(tokenDict)

58535

## Initialize Matrix

In [7]:
from scipy.sparse import coo_matrix
import numpy as np

matrix = coo_matrix((3, 58535))
matrix = matrix.tocsr()
matrix

<3x58535 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

## Fill Matrix 

In [None]:
'''Fill the Matrix'''

for index, row in train.iterrows():
    for token in row['text'].split(' '):
        matrix[row['labels'], tokenDict[token]] = matrix[row['labels'], tokenDict[token]] + 1
        matrix[2, tokenDict[token]] = matrix[2, tokenDict[token]] + 1



In [None]:
'''Verify Matrix'''

print(matrix[:, 0].sum())

In [None]:
print(matrix[:, 0])
print(wordCountVector[tokenDict['and']])

## Make Predictions on Test Data - Load, Clean, Predict Prob

In [None]:
'''Load Test Data'''
test = pd.read_excel('..//0.data//raw/CCC_Test.xlsx')
test['Target'] = None

test.head()

In [None]:
'''Clean Test Commentary Text Data'''

import time; t0 = time.time()

test.Commentary = [clean_paragraph(para) for para in test.Commentary]

print(time.time() - t0)

In [None]:
for index, row in test.iterrows():
    pred_test_prob = 0.0
    pred_class = 0
    
    for target_class in range(0, 4):
        product = prob_of_each_class[target_class]
        
        for token in row['Commentary'].split(' '):
            if tokenDict.get(token) == None:
                continue
                
            p_value = matrix[target_class, tokenDict[token]] / wordCountVector[tokenDict[token]]
            
            if p_value != 0.0:
                product = product * p_value
                
        if product > pred_test_prob:
            pred_class = target_class
            pred_test_prob = product
            
    test.at[index, 'Target'] = pred_class

## Convert Labels back to Names

In [None]:
for index, row in test.iterrows():
    if row.Target == 0:
        test.at[index, 'Target'] = "Run_Bw_Wickets"
    if row.Target == 1:
        test.at[index, 'Target'] = "Dot"
    if row.Target == 2:
        test.at[index, 'Target'] = "Boundary"
    if row.Target == 3:
        test.at[index, 'Target'] = "Wicket"

In [None]:
test.to_csv('..//5.outputs//output.csv')

# Accuracy so far - 61.828 %

## Include Over_Run_Total Information Also

In [None]:
train.groupby(['Over_Run_Total'])['Target'].unique()

In [None]:
train.Over_Run_Total.nunique()

In [None]:
'''Create Matrix to store prob info for runs'''

matrix_runs_prob = coo_matrix((5, 37))
matrix_runs_prob = matrix_runs_prob.tocsr()
matrix_runs_prob

In [None]:
'''Fill Matrix of Runs'''

for index, row in train.iterrows():
    matrix_runs_prob[row['Target'], row['Over_Run_Total']] = matrix_runs_prob[row['Target'], row['Over_Run_Total']] + 1
    matrix_runs_prob[4, row['Over_Run_Total']] = matrix_runs_prob[4, row['Over_Run_Total']] + 1

In [None]:
print(matrix_runs_prob[:, 0])

In [None]:
'''Make New Predictions'''

for index, row in test.iterrows():
    pred_test_prob = 0.0
    pred_class = 0
    
    for target_class in range(0, 4):
        product = prob_of_each_class[target_class]
        run_prob = matrix_runs_prob[target_class, row['Over_Run_Total']] / matrix_runs_prob[4, row['Over_Run_Total']]
        
        if run_prob != 0.0:
            product = product * run_prob
        
        for token in row['Commentary'].split(' '):
            if tokenDict.get(token) == None:
                continue
                
            p_value = matrix[target_class, tokenDict[token]] / wordCountVector[tokenDict[token]]
            
            if p_value != 0.0:
                product = product * p_value
                
        if product > pred_test_prob:
            pred_class = target_class
            pred_test_prob = product
            
    test.at[index, 'Target'] = pred_class


In [None]:
for index, row in test.iterrows():
    if row.Target == 0:
        test.at[index, 'Target'] = "Run_Bw_Wickets"
    if row.Target == 1:
        test.at[index, 'Target'] = "Dot"
    if row.Target == 2:
        test.at[index, 'Target'] = "Boundary"
    if row.Target == 3:
        test.at[index, 'Target'] = "Wicket"

test.to_csv('..//5.outputs//output2.csv')

# Accuracy = 62.45% 