# NLP Pipeline

## Imports
pandas is used for exploratory analysis, and csv is used for reading the CSV data files. NumPy provides the arrays, scikit-learn is used for training the classification models, and nltk and string are used for various NLP-related processing steps.

In [1]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_files
from nltk.stem.porter import PorterStemmer
import nltk 
import string

In [2]:
# Preview the raw training set; the frame is only displayed here, not stored.
pd.read_csv('intermediate_trainset.csv')

Unnamed: 0,Season_Epi_code,Pitched_Business_Identifier,Pitched_Business_Desc,Deal_Status,Deal_Shark
0,826,Laid Brand,hair-care products made with pheromones . Laid...,0,
1,826,Wine & Design,painting classes with wine served . Wine & Des...,1,KOL
2,824,Peoples Design,a mixing bowl with a built-in scoop . Peoples ...,1,LG
3,824,Rumi Spice,saffron imported from Afghanistan . Rumi Spice...,1,MC
4,824,Wallet Buckle,a belt buckle that holds credit cards . The Wa...,1,RH
...,...,...,...,...,...
525,102,Sticky Note Holder,a Post-It note arm for laptops . a Post-It not...,0,
526,101,College Foxes Packing Boxes,a packing and organizing service based on an a...,0,
527,101,Ionic Ear,an implantable Bluetooth device requiring surg...,0,
528,101,Mr. Tod's Pie Factory,a pie company,1,BC+DJ


## Cleaning the CSV file and removing redundant words and punctuation

In [3]:
# Read the training CSV and build three parallel arrays, one entry per data
# row: the cleaned text taken from column 2, the deal outcome (second-to-last
# column, cast to int) and the shark code(s) (last column).
desc_rows, result_rows, shark_rows = [], [], []

# newline='' is what the csv module asks for so that line breaks inside quoted
# fields are parsed correctly.
# NOTE(review): no encoding is passed, so the platform default applies; the
# 'Â' / '\xa0' clean-up below hints at an encoding mismatch - confirm the
# file's real encoding.
with open('intermediate_trainset.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # skip the header row (no-op on an empty file)
    for row in csv_reader:
        if not row:  # csv.reader yields [] for blank lines
            continue
        # Normalise non-breaking spaces, stray 'Â' characters and hyphens,
        # lower-case, then keep only alphanumeric and whitespace characters.
        text = row[2].replace('\xa0', ' ').replace('Â', '').replace('-', ' ').lower()
        desc_rows.append(''.join(ch for ch in text if ch.isalnum() or ch.isspace()))
        result_rows.append(row[-2])
        shark_rows.append(row[-1])

# Convert once at the end: np.append re-allocates and copies the whole array
# on every call, which made the original loop quadratic in the row count.
sklearnDesc = np.array(desc_rows)
sklearnResult = np.array(result_rows).astype('int')
sklearnShark = np.array(shark_rows)

# Show a short preview instead of dumping every description.
print(sklearnDesc[:5])

['hair care products made with pheromones  laid brand is a  pheromone enriched hair care product that enhances color in addition to protecting and hydrating hair the pheromones help girls âœexude confidenceâ wherever she goes'
 'painting classes with wine served  wine  design provides space for painting classes for all ages and experience levels where a local artist guides each student with stroke by stroke instructions to ensure you paint your own unique masterpiece'
 'a mixing bowl with a built in scoop  peoples designs scooping bowl is a multi purpose mixing bowl which saves time space and money in the kitchen it features a removable spatula egg separator all in one washing and draining in addition to separating various ingredients easily'
 'saffron imported from afghanistan  rumi spice offers exceptionally high quality saffron of afganistan which adds a burst of flavor to any dish it graces the tables and kitchens of the best restaurants and retailers across the nation'
 'a belt bu

## Creating the train and test data for the ML algorithms

In [4]:
from sklearn.model_selection import train_test_split

# Split descriptions and labels into train/test parts. random_state pins the
# shuffle; no test_size is passed, so scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    sklearnDesc,
    sklearnResult,
    random_state=42,
)

# Number of samples in each split, shown as the cell output.
tuple(len(part) for part in (X_train, X_test))


(397, 133)

## Stemming tokenizer

In [5]:
stemmer = PorterStemmer()

# Translation table mapping every punctuation character and digit to None,
# i.e. str.translate deletes them.
trans_table = {code: None for code in map(ord, string.punctuation + string.digits)}


def tokenize(text):
    """Return the Porter stems of the words in ``text``.

    Punctuation and digits are stripped first, the remainder is split with
    ``nltk.word_tokenize``, and tokens of length one or less are dropped
    before stemming.
    """
    stripped = text.translate(trans_table)
    stems = []
    for word in nltk.word_tokenize(stripped):
        if len(word) <= 1:
            continue
        stems.append(stemmer.stem(word))
    return stems

## Vectorizing the training data

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features built with the stemming tokenizer defined above, capped at
# 5000 terms, with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words="english", max_features = 5000, decode_error = "ignore", tokenizer=tokenize)

# fit_transform learns the vocabulary and builds the matrix in one pass; the
# separate fit() call that used to precede it repeated the same work.
vectorizer.fit_transform(X_train).shape



(397, 2301)

## Multinomial Naive Bayes Classifier

In [7]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes model on the TF-IDF features of the
# training split (fit returns the estimator itself).
cls = MultinomialNB().fit(vectorizer.transform(X_train), y_train)

# Evaluate on the held-out split: overall accuracy, then the per-class report.
y_pred = cls.predict(vectorizer.transform(X_test))
print(
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)



0.518796992481203
              precision    recall  f1-score   support

           0       0.65      0.27      0.38        73
           1       0.48      0.82      0.60        60

    accuracy                           0.52       133
   macro avg       0.56      0.55      0.49       133
weighted avg       0.57      0.52      0.48       133



## RBF Support Vector Classifier

In [8]:
# Import the estimator class directly: the previous `svm = svm.SVC(...)`
# rebound the imported module name `svm` to a model instance.
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, trained on the TF-IDF
# features of the training split. The variable keeps its original name.
svm = SVC(kernel = "rbf")
svm.fit(vectorizer.transform(X_train), y_train)

# Evaluate on the held-out split.
y_pred = svm.predict(vectorizer.transform(X_test))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.42857142857142855
              precision    recall  f1-score   support

           0       0.38      0.07      0.12        73
           1       0.43      0.87      0.58        60

    accuracy                           0.43       133
   macro avg       0.41      0.47      0.35       133
weighted avg       0.41      0.43      0.32       133



## Linear Support Vector Classifier

In [9]:
# Import the estimator class directly: the previous `svm = svm.SVC(...)`
# rebound the imported module name `svm` to a model instance.
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the TF-IDF
# features of the training split. The variable keeps its original name.
svm = SVC(kernel = "linear")
svm.fit(vectorizer.transform(X_train), y_train)

# Evaluate on the held-out split.
y_pred = svm.predict(vectorizer.transform(X_test))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.5413533834586466
              precision    recall  f1-score   support

           0       0.62      0.44      0.51        73
           1       0.49      0.67      0.57        60

    accuracy                           0.54       133
   macro avg       0.55      0.55      0.54       133
weighted avg       0.56      0.54      0.54       133



## Decision Tree Classifier

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree on the TF-IDF features. random_state is fixed (same
# seed as the train/test split) so that re-running the notebook rebuilds the
# same tree; without it, equally good splits are broken at random.
decisiontree = DecisionTreeClassifier(max_depth=5, random_state=42)
decisiontree.fit(vectorizer.transform(X_train), y_train)

# Evaluate on the held-out split.
y_pred = decisiontree.predict(vectorizer.transform(X_test))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.5789473684210527
              precision    recall  f1-score   support

           0       0.57      0.92      0.71        73
           1       0.62      0.17      0.26        60

    accuracy                           0.58       133
   macro avg       0.60      0.54      0.48       133
weighted avg       0.60      0.58      0.51       133



## Random Forest Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Small forest of shallow trees, each split considering a single feature.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the scores, are reproducible.
randomforest = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=42)
randomforest.fit(vectorizer.transform(X_train), y_train)

# Evaluate on the held-out split.
y_pred = randomforest.predict(vectorizer.transform(X_test))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.45112781954887216
              precision    recall  f1-score   support

           0       0.50      0.05      0.10        73
           1       0.45      0.93      0.61        60

    accuracy                           0.45       133
   macro avg       0.47      0.49      0.35       133
weighted avg       0.48      0.45      0.33       133



## Preprocessing the test set, vectorizing and predicting

In [18]:
# Read the hold-out CSV and clean the text in column 2 of every data row:
# normalise non-breaking spaces, stray 'Â' characters and hyphens, lower-case,
# then keep only alphanumeric and whitespace characters.
final_rows = []

# newline='' is what the csv module asks for so that line breaks inside quoted
# fields are parsed correctly.
with open('intermediate_testset.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # skip the header row (no-op on an empty file)
    for row in csv_reader:
        if not row:  # csv.reader yields [] for blank lines
            continue
        text = row[2].replace('\xa0', ' ').replace('Â', '').replace('-', ' ').lower()
        final_rows.append(''.join(ch for ch in text if ch.isalnum() or ch.isspace()))

# Build the array once; np.append inside the loop copied it on every row.
Final_test = np.array(final_rows)

# Vectorise with the already-fitted TF-IDF vocabulary and predict with the
# decision tree. The stand-alone transform() call whose result was discarded
# has been dropped.
final_pred = decisiontree.predict(vectorizer.transform(Final_test))
final_pred


array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Exporting the CSV with Deal Status predictions

In [34]:
from pathlib import Path

# Re-read the hold-out set as a DataFrame and attach the predicted outcomes;
# pandas raises if the number of predictions does not match the row count.
final_df = pd.read_csv('intermediate_testset.csv')
final_df['Deal_Status'] = final_pred

# Make sure the target directory exists before writing.
filepath = Path('./Final.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the file; otherwise it is
# written as an extra unnamed column (which reads back as 'Unnamed: 0').
final_df.to_csv(filepath, index=False)
