# In [1]:  (notebook cell marker left over from the Jupyter export)
#libraries required

import pandas as pd
import numpy as np
import re
import unicodedata

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
# Extend spaCy's stop-word set with words that should not take part in topic
# discovery for this corpus.
# NOTE(review): the space-padded 'tyre' variants are kept for parity with the
# original set, but tokens produced by str.split() never contain spaces, so
# they presumably never match - confirm and drop if so.
STOP_WORDS.update((
    'tyre',
    ' tyre',
    'tyre ',
    ' tyre ',
    'great',
    'good',
    'easy',
    'excellent',
))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#model
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

#plots
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Small English spaCy pipeline; make_to_base() uses it for lemmatisation.
nlp = spacy.load('en_core_web_sm')

# Read the dataset and discard the 'Unnamed: 1' column straight away.
df = pd.read_csv("sentisum-assessment-dataset.csv").drop(columns=['Unnamed: 1'])
# Name the text column "document" (this assignment requires that exactly one
# column is left after the drop).
df.columns = ["document"]

# Lookup table of contractions and their expansions (list taken from the web).
# expand() replaces every key found in a sentence with its value using plain
# substring replacement, so:
#   * keys are lowercase - the text must be lowercased before expansion;
#   * compound forms ("can't've") contain a shorter key ("can't") as a prefix;
#   * the informal abbreviations at the end are padded with spaces so that
#     they only match when surrounded by spaces.
# NOTE(review): the alphabetical list stops at "wasn't" - forms such as
# "won't" or "you're" are not present. Confirm whether that is intentional.
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
# Informal chat abbreviations; the surrounding spaces are part of the key.
" u ": " you ",
" ur ": " your ",
" n ": " and ",
"tbh":"to be honest" }

#functions

def get_avg_word_len(x):
    """Get the average word length of a sentence.

    Words are the whitespace-separated tokens of ``x``.

    param x(str): the sentence whose average word length is wanted
    return leng(float): mean number of characters per word, or 0.0 when the
        sentence contains no words (empty or whitespace-only input)"""

    words = x.split()
    # Guard against empty documents: without it the division below raises
    # ZeroDivisionError and aborts the whole DataFrame.apply().
    if not words:
        return 0.0
    return sum(map(len, words)) / len(words)

def feature_extract(df,d):
    """Adds surface-statistics columns to the given df, computed from df[d].

    The dataframe is modified in place; nothing is returned. Every cell is
    converted with str() first, so a non-string cell (e.g. NaN) is measured
    as its string form instead of raising part-way through.

    count: number of whitespace-separated words in the document
    char count: number of characters in the document
    avg word_len: average number of characters per word
    stop_words_len: number of words that are in STOP_WORDS
    numeric_count: number of words made up only of digits
    upper_counts: number of words longer than 3 characters in CAPS LOCK

    param df(dataframe): dataframe on which manipulation is to be done
    param d(str): column name in which the required text is present"""

    # Coerce and tokenise once and reuse for every feature below.
    texts = df[d].apply(str)
    tokens = texts.apply(lambda text: text.split())

    df['count'] = tokens.apply(len)
    df['char count'] = texts.apply(len)
    df['avg word_len'] = texts.apply(get_avg_word_len)
    df['stop_words_len'] = tokens.apply(lambda words: sum(1 for t in words if t in STOP_WORDS))
    df['numeric_count'] = tokens.apply(lambda words: sum(1 for t in words if t.isdigit()))
    # The length check applies to the token itself: short all-caps words such
    # as "I" or "OK" are not counted as shouting. (It used to test the length
    # of the whole document, which made the check almost always true.)
    df['upper_counts'] = tokens.apply(lambda words: sum(1 for t in words if t.isupper() and len(t) > 3))


def expand(x, mapping=None):
    """Some of the words like 'i'll', are expanded to 'i will' for better text processing
    The list of contractions is taken from the internet

    Matching is a plain, case-sensitive substring replacement, so the text
    should already be lowercased when the default table is used.

    param x(str): the sentence in which contractions are to be found and expansions are to be done
    param mapping(dict): optional {contraction: expansion} table; when omitted
        the module-level ``contractions`` table is used

    return x(str): the expanded sentence; anything that is not a string is
        returned unchanged"""
    if not isinstance(x, str):
        return x
    if mapping is None:
        mapping = contractions
    # Replace the longest keys first. In insertion order "can't" is handled
    # before "can't've", which turns the latter into "cannot've" and leaves
    # the compound entry unreachable. sorted() is stable, so keys of equal
    # length keep their original relative order.
    for key in sorted(mapping, key=len, reverse=True):
        x = x.replace(key, mapping[key])
    return x

def remove_accented_chars(x):
    """Replace accented characters by their plain ASCII equivalent.

    The text is decomposed with the 'NFKD' normal form, which splits an
    accented letter into its base letter followed by the combining mark(s).
    Encoding to ASCII while ignoring errors then drops the marks, together
    with any other character that has no ASCII form.

    param x(str): the sentence in which accented characters are to be replaced
    return x(str): the sentence containing ASCII characters only"""

    decomposed = unicodedata.normalize('NFKD', x)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def make_to_base(x):
    """Lemmatize a sentence, i.e. reduce every word to its dictionary head word.

    Tokens whose lemma is '-PRON-' or 'be' keep their original text instead
    of the lemma.
    NOTE(review): '-PRON-' is presumably the placeholder lemma that some
    spaCy versions give to pronouns - confirm against the installed version.

    param x(str): the sentence in which the words are to be converted (lemmatization)
    return x(str): the lemmatized sentence, tokens joined by single spaces"""

    def _base_form(tok):
        lemma = str(tok.lemma_)
        return tok.text if lemma in ('-PRON-', 'be') else lemma

    return " ".join(_base_form(tok) for tok in nlp(x))
    
def preprocess(df,d):
    """Preprocesses the given document column by applying, in this order:
    lower: lowers all the characters for uniformity
    expansion: expands words like i'll to i will for better text classification
    remove accented characters: change accented characters to its normal equivalent
    remove special characters: using regex, removes all the punctuations etc
    remove space: removes trailing spaces and extra spaces between words
    lemmatization: changes the words to their base form
    remove stop words: removes the stop words in the sentence

    The column df[d] is overwritten in place; nothing is returned.

    param df(dataframe): dataframe holding the documents
    param d(str): name of the column that contains the text"""

    def _clean(text):
        text = expand(text.lower())
        # Fold accents BEFORE stripping special characters: the regex below
        # deletes every non-ASCII character, so running it first would turn
        # "café" into "caf" rather than "cafe".
        text = remove_accented_chars(text)
        text = re.sub('[^A-Z a-z 0-9-]+', '', text)
        text = " ".join(text.split())
        text = make_to_base(text)
        # Stop words are filtered after lemmatization, so the check is made
        # against the base form of each word.
        return " ".join(t for t in text.split() if t not in STOP_WORDS)

    df[d] = df[d].apply(_clean)

def get_bow(df,d):
    """Build a bag-of-words table of 3- and 4-word n-gram counts.

    Only n-grams found in at least 5 documents (min_df=5) and in at most half
    of the documents (max_df=0.5) are kept.

    param df(dataframe): dataframe holding the preprocessed documents
    param d(str): name of the column that contains the text
    return cv, bow: the fitted CountVectorizer and a dense dataframe with one
        row per document and one column per n-gram"""
    cv = CountVectorizer(ngram_range=(3,4),min_df=5, max_df=0.5)
    text_counts = cv.fit_transform(df[d])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    try:
        feature_names = cv.get_feature_names_out()
    except AttributeError:
        feature_names = cv.get_feature_names()
    bow = pd.DataFrame(text_counts.toarray(), columns=feature_names)
    return cv,bow

def tf_bow(df,d):
    """Build a TF-IDF weighted table of 3- and 4-word n-grams.

    Only n-grams found in at least 5 documents (min_df=5) and in at most half
    of the documents (max_df=0.5) are kept.

    param df(dataframe): dataframe holding the preprocessed documents
    param d(str): name of the column that contains the text
    return tf, tf_df: the fitted TfidfVectorizer and a dense dataframe with
        one row per document and one column per n-gram"""
    tf = TfidfVectorizer(ngram_range=(3,4),min_df=5, max_df=0.5)
    text_counts = tf.fit_transform(df[d])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    try:
        feature_names = tf.get_feature_names_out()
    except AttributeError:
        feature_names = tf.get_feature_names()
    tf_df = pd.DataFrame(text_counts.toarray(), columns=feature_names)
    return tf,tf_df

# Surface statistics are taken first, on the raw text: preprocess() lowercases
# and rewrites the column, after which e.g. the CAPS LOCK count would be lost.
feature_extract(df, 'document')
preprocess(df, 'document')

# Count-based and TF-IDF n-gram tables of the cleaned documents.
cvectorizer, data_cv = get_bow(df, 'document')
tvectorizer, data_tf = tf_bow(df, 'document')

# Fit a 12-topic LDA model on the n-gram counts (fixed seed for repeatability).
n_topics = 12
model_lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=0,
).fit(data_cv)

# Print the top 10 words per topic
#n_words = 10
#feature_names = cvectorizer.get_feature_names()

#topic_list = []
#for topic_idx, topic in enumerate(model_lda.components_):
#    top_n = [feature_names[i]
#             for i in topic.argsort()
#                 [-n_words:]][::-1]
    
#    top_features = ' '.join(top_n)
    
#    topic_list.append(f"topic_{'_'.join(top_n[:3])}") 

#    print(f"Topic {topic_idx}: {top_features}")
#    print('\n\n\n')
                      

# Topic distribution of every document; each document is tagged with the
# index of its highest-scoring topic.
topic_result = model_lda.transform(data_cv)
df['topic'] = np.argmax(topic_result, axis=1)
                      
#fig = plt.figure(figsize=(80,32))
#for i in range(12):
#    ax = fig.add_subplot(4,3,i+1)
#    topic = i
#    text = ' '.join(df.loc[df['topic']==i,'document'].values)
#    wc = WordCloud(width=1000, height=1000, random_state=1, background_color='Black',colormap='Set2',collocations=False).generate(text)
#    ax.imshow(wc)
#    ax.set_title(topic)
#    ax.axis("off");
                      
def assign_topic(x):
    """Translate a topic index into its human-readable label.

    param x(int): topic index, 0 to 11
    return (str): the label for that index, or None when x matches no index"""
    # Position in this tuple is the topic index.
    labels = (
        "ease of booking",
        "value for money",
        "garage service",
        "length of fitting",
        "mobile fitter",
        "tyre quality",
        "delivery punctuality",
        "booking confusion",
        "location",
        "wait time",
        "change of date",
        "discounts",
    )
    for index, label in enumerate(labels):
        if x == index:
            return label
    return None
    
# Swap the numeric topic index for its readable label.
df['topic'] = df['topic'].apply(assign_topic)

