#### pandas for data frame
#### the next 4 are for the classifiers
#### CalibratedClassifierCV is to get probability for GaussianNB and LinearSVC
#### CountVectorizer is to create a vector for words i.e. each word will be given a specific number
#### TransformerMixin (from sklearn.base) is the base class we subclass to write our own pipeline transformers
#### Pipeline -> an input is given then the first function in the pipeline is run, the output of the first function becomes the input to the second function in the pipeline and so on
#### spacy and stopwords
#### punctuations
#### for many files in a folder
#### random to randomize sentiment data
#### numpy for np.nan

In [1]:
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

from string import punctuation as punctuations

import os
import glob
from docx import Document

import random

import numpy as np

#### enter path of training data here

In [2]:
# Load the labelled training data into a DataFrame.
# NOTE(review): the path ends in .csv but the file is parsed with read_json —
# confirm the file really contains JSON; if it is a true CSV this should be pd.read_csv.
df = pd.read_json("C:/Users/jaski/OneDrive/Desktop/igt/ML_EMAIL6 - OTA.csv")

#### loading english in spacy

In [3]:
# Load spaCy's English pipeline through the 'en' shortcut name.
# NOTE(review): shortcut names such as 'en' are not available in spaCy v3+; on a newer
# install this presumably needs a full package name (e.g. 'en_core_web_sm') — confirm
# against the installed spaCy version.
nlp = spacy.load('en')

#### default functions for spacy 
#### spacy_tokenizer is the custom tokenizer we pass to CountVectorizer to split sentences into tokens

#### our tokenizer ignores all pronouns, proper nouns, stopwords and punctuations and converts string to lowercase

In [4]:
class predictors(TransformerMixin):
    """First pipeline step: run clean_text over every document.

    The transformer keeps no state, so fit() is a no-op and get_params()
    reports no parameters.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return self so the pipeline can chain calls."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with clean_text applied to each item of X, in order."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Return an empty parameter mapping (there are no settings)."""
        return dict()

def clean_text(text):
    """Return *text* with surrounding whitespace removed and all letters lowercased."""
    trimmed = text.strip()
    return trimmed.lower()

def spacy_tokenizer(sentence):
    """Tokenise *sentence* with spaCy and return the list of kept lemmas.

    Tokens tagged PROPN or PRON are skipped. Each remaining token is replaced
    by its lowercased, stripped lemma, and lemmas found in `stopwords` or in
    the `punctuations` string are dropped.
    """
    kept = []
    for token in nlp(sentence):
        if token.pos_ in ("PROPN", "PRON"):
            continue
        lemma = token.lemma_.lower().strip()
        # `punctuations` is a plain string, so this is a substring test;
        # an empty lemma is therefore filtered out here as well.
        if lemma in stopwords or lemma in punctuations:
            continue
        kept.append(lemma)
    return kept

class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that returns X.todense().

    It is placed between the vectorizer and the classifier in pipe2 and pipe3.
    """

    def transform(self, X, y=None, **fit_params):
        """Return the dense form of X; X must provide a todense() method."""
        dense = X.todense()
        return dense

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return self so the pipeline can chain calls."""
        return self

# Bag-of-words vectorizer: counts single words (ngram_range=(1,1)) produced by spacy_tokenizer.
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

#### the ml algorithms used are : LogisticRegression, Gaussian Naive Bayes, and Linear Support vector classification

In [5]:
# classifier1..3 are the three models used for email categorisation;
# classifier4 is the separate logistic regression used for sentiment.
classifier1 = LogisticRegression(solver="lbfgs", multi_class="auto")
# GaussianNB and LinearSVC are wrapped in CalibratedClassifierCV (3-fold) so that
# predict_proba is available on them.
classifier2 = CalibratedClassifierCV(GaussianNB(), cv=3)
classifier3 = CalibratedClassifierCV(LinearSVC(), cv=3)
classifier4 = LogisticRegression(solver="lbfgs")

#### creating the pipelines

In [6]:
# Every pipeline gets its OWN unfitted CountVectorizer, built from the settings of the
# module-level `vectorizer`. A Pipeline fits the step objects it is handed in place, so
# sharing a single vectorizer instance lets a later fit (pipe4 on the sentiment data)
# replace the vocabulary that the already-fitted category pipelines rely on.
def _fresh_vectorizer():
    """Return a new, unfitted CountVectorizer with the same parameters as `vectorizer`."""
    return CountVectorizer(**vectorizer.get_params())


# Email category, model 1: logistic regression on the sparse count matrix.
pipe1 = Pipeline([("cleaner", predictors()),
                  ('vectorizer', _fresh_vectorizer()),
                  ('classifier', classifier1)])

# Email category, model 2: calibrated GaussianNB; counts are densified first.
pipe2 = Pipeline([("cleaner", predictors()),
                  ('vectorizer', _fresh_vectorizer()),
                  ('to_dense', DenseTransformer()),
                  ('classifier', classifier2)])

# Email category, model 3: calibrated LinearSVC; counts are densified first.
pipe3 = Pipeline([("cleaner", predictors()),
                  ('vectorizer', _fresh_vectorizer()),
                  ('to_dense', DenseTransformer()),
                  ('classifier', classifier3)])

# Sentiment model: logistic regression with its own vocabulary.
pipe4 = Pipeline([("cleaner", predictors()),
                  ('vectorizer', _fresh_vectorizer()),
                  ('classifier', classifier4)])

#### randomizing our df

In [7]:
# Shuffle the rows (frac=1 samples the whole frame) and renumber the index from 0.
df = df.sample(frac=1).reset_index(drop=True)

#### creating training data

In [8]:
# Train on the complete data set. To hold out a test split instead, slice here
# (train_df = df[:110], test_df = df[110:]) and build test_X / test_y from
# test_df["text"] / test_df["label"] in the same way.
train_df = df
train_X, train_y = train_df["text"], train_df["label"]

## fitting all the data except sentiment
## at one time, we can have only one set of data
## so first we do all the task with email categorisation then we move on to sentiment

In [9]:
# Fit the logistic-regression category pipeline on the training text and labels.
pipe1.fit(train_X, train_y)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x00000220BB0EA2B0>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [10]:
# Fit the calibrated GaussianNB category pipeline on the same training data.
pipe2.fit(train_X, train_y)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x00000220BB0EA278>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...V(base_estimator=GaussianNB(priors=None, var_smoothing=1e-09),
            cv=3, method='sigmoid'))])

In [11]:
# Fit the calibrated LinearSVC category pipeline on the same training data.
pipe3.fit(train_X, train_y)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x00000220BB0EA4A8>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ... penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv=3, method='sigmoid'))])

In [12]:
#print(pipe1.score(test_X, test_y))
#print(pipe2.score(test_X, test_y))
#print(pipe3.score(test_X, test_y))

#### creating a voted classifier using the 3 algorithms for the best result
#### if 2 of the classifiers predict the same category, we take that category as the final output, probability becomes the mean of the two probabilities
#### if all 3 classifiers give different categories, we take the one with the maximum confidence

In [13]:
def predict(data):
    """Classify one text with pipe1, pipe2 and pipe3 and combine their votes.

    Returns a two-item list ``[label, confidence]``:

    * If any two pipelines predict the same label, that label is returned with
      the mean of those two pipelines' top probabilities. Pairs are checked in
      the order (1, 2), (1, 3), (2, 3) and the first match wins, so when all
      three agree the mean is taken over pipe1 and pipe2.
    * Otherwise the label of the pipeline with the highest top probability is
      returned together with that probability.
    """
    batch = [data]
    models = (pipe1, pipe2, pipe3)
    labels = [model.predict(batch) for model in models]
    # predict_proba gives one row for the single sample; the nested max picks
    # the largest class probability in that row.
    confidences = [max(max(model.predict_proba(batch))) for model in models]

    for first, label in enumerate(labels):
        for second in range(first + 1, len(labels)):
            if label == labels[second]:
                agreed = (confidences[first] + confidences[second]) / 2
                return [' '.join(label), agreed]

    best = max(confidences)
    winner = confidences.index(best)
    return [' '.join(labels[winner]), best]

#### path to folder containing all the email files

In [14]:
# Folder that is searched for the email documents to classify.
path = "C:/Users/jaski/OneDrive/Documents/python/one_many"

#### change the extension to get any other files
#### will take only the files with that extension

In [15]:
# Read every .docx file in `path`.
#   filenames[i] -> base name of the i-th document
#   files[i]     -> list of its non-empty text lines (one entry per paragraph line)
#
# Fixes over the previous version:
#   * the text buffer was never initialised or reset, so the loop either raised
#     NameError or carried text from one document into the next;
#   * the character-by-character split ran inside the paragraph loop, re-adding all
#     earlier lines on every paragraph;
#   * the base name was taken with split('\\')[1], which only works for one specific
#     Windows path shape; os.path.basename handles any path.
files = []
filenames = []
for filename in glob.glob(os.path.join(path, '*.docx')):
    document = Document(filename)
    filenames.append(os.path.basename(filename))
    lines = []
    for para in document.paragraphs:
        # A paragraph's text may itself contain line breaks; treat each line separately.
        lines.extend(para.text.split("\n"))
    files.append([line for line in lines if line != ''])

#### path to positive, negative training data

In [16]:
# Read the positive and negative sentiment training texts (one sample per line).
# The files are opened in `with` blocks so the handles are closed right after reading.
with open("C:/Users/jaski/OneDrive/Desktop/igt/own_positive.txt", "r") as pos_file:
    short_pos = pos_file.read()
with open("C:/Users/jaski/OneDrive/Desktop/igt/own_negative.txt", "r") as neg_file:
    short_neg = neg_file.read()

#### manipulating the sentiment data and shuffling it to train

In [17]:
# Build the labelled sentiment samples as (text, label) tuples and shuffle them.
# Blank lines (for example the empty string produced by a trailing newline) are
# skipped: they would otherwise become empty samples labelled both "pos" and "neg".
docx = [(line, "pos") for line in short_pos.split("\n") if line.strip()]
docx += [(line, "neg") for line in short_neg.split("\n") if line.strip()]
random.shuffle(docx)

In [18]:
# The shuffled (text, label) tuples are used as the sentiment training set.
train_data_sentiment = docx

#### df_pred_category -> [ category of para1, para2, para3 ]
#### df_prob_category -> [ probability of para1, para2, para3 ]
#### df_pred_sentiment -> [ pos, neut, neg]

In [19]:
# Result accumulators; each gets one entry per document in `files`.
df_pred_category = []   # predicted categories per document
df_prob_category = []   # probabilities matching df_pred_category
df_pred_sentiment = []  # [positive, neutral, negative] slots per document

#### storing categories for each para
#### we sort before appending np.nan because sorting after will give random results
#### np.nan will give blank in csv or excel
#### padding with np.nan only happens when a file has fewer than 3 paragraphs
#### if it has 3 or more, no padding is added
#### in the end, we take the first 3 elements

In [20]:
# For every document, classify each paragraph and keep three (category, probability)
# results, padding with np.nan when the document has fewer than three paragraphs.
#
# Fixes over the previous version:
#   * zip(*sorted(...)) produced tuples, so .append raised AttributeError for any
#     document with fewer than 3 paragraphs, and unpacking failed for an empty one;
#   * results were sorted in ascending order, so the [:3] slice kept the three LEAST
#     confident paragraphs; they are now ordered most-confident first so that
#     "Category 1" / "Prob 1" hold the strongest prediction.
for file in files:
    scored = []
    for para in file:
        category, probability = predict(para)
        scored.append((probability, category))
    # Sort on the probability only; the sort is stable, so ties keep paragraph order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    top = scored[:3]
    top += [(np.nan, np.nan)] * (3 - len(top))
    df_pred_category.append([category for _, category in top])
    df_prob_category.append([probability for probability, _ in top])

## our tasks with email categorization finish and we move on to sentiment

#### fitting sentiment data

In [21]:
# Split the (text, label) tuples into parallel lists and fit the sentiment pipeline.
sentiment_texts = [sample for sample, _ in train_data_sentiment]
sentiment_labels = [label for _, label in train_data_sentiment]
pipe4.fit(sentiment_texts, sentiment_labels)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x00000220BB0EA470>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

#### probability < 0.65 will be neutral
#### array -> [ pos, neut, neg ]

In [22]:
# Score the sentiment of each document as a whole (all of its lines joined by spaces).
# Each result row has three slots: [positive, neutral, negative].
for file in files:
    document_text = ' '.join(file)
    label = pipe4.predict([document_text])
    # Highest class probability for the single sample.
    confidence = max(max(pipe4.predict_proba([document_text])))
    if confidence < 0.65:
        # Not confident enough either way: mark the document as neutral.
        row = [np.nan, "neut", np.nan]
    elif label == "pos":
        row = [confidence, np.nan, np.nan]
    else:
        row = [np.nan, np.nan, confidence]
    df_pred_sentiment.append(row)

#### creating dictionary to create the df

In [23]:
# Column layout of the output table, built in the order the columns should appear.
# NOTE(review): the third category header is lowercase ("category 3") unlike the
# first two — kept as-is because it is the emitted column name.
df_dict = {"ID": filenames}

category_headers = (("Category 1", "Prob 1"),
                    ("Category 2", "Prob 2"),
                    ("category 3", "Prob 3"))
for position, (category_header, prob_header) in enumerate(category_headers):
    df_dict[category_header] = [row[position] for row in df_pred_category]
    df_dict[prob_header] = [row[position] for row in df_prob_category]

sentiment_headers = ("Positive Feedback", "Neutral Feedback", "Negative Feedback")
for position, sentiment_header in enumerate(sentiment_headers):
    df_dict[sentiment_header] = [row[position] for row in df_pred_sentiment]

#### creating df then to csv

In [24]:
# Build the result table; each key of df_dict becomes one column.
final_df = pd.DataFrame(df_dict)

#### Path to csv file

In [25]:
# Write the result table to disk. `index` is left at its default, so the row index
# is written as the first column of the CSV.
final_df.to_csv("C:/Users/jaski/OneDrive/Desktop/igt/email_para_cat_sentiment.csv")