In [1]:
import pandas as pd

import re

import nltk

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams

## Text Cleaning

In [21]:
def porter_stemming(text):
    """Stem every space-separated word of ``text`` with NLTK's Porter stemmer.

    The text is split on single spaces (so consecutive spaces yield empty
    tokens that are passed through the stemmer as well) and the stemmed
    tokens are re-joined with single spaces.

    Args:
        text: The string to stem.

    Returns:
        The stemmed string.
    """
    stemmer = PorterStemmer()
    return " ".join(map(stemmer.stem, text.split(" ")))


def wordnet_lemmatization(text):
    """Lemmatize every space-separated word of ``text`` with WordNet.

    Each token is lemmatized with ``WordNetLemmatizer.lemmatize`` using its
    default arguments; tokens are obtained by splitting on single spaces and
    are re-joined with single spaces.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized string.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = text.split(" ")
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is tokenized with NLTK's ``word_tokenize``; every token that is
    not in NLTK's English stop-word list is kept, in order, and the survivors
    are joined with single spaces. Membership is an exact (case-sensitive)
    match against the stop-word list.

    Args:
        text: The string to filter.

    Returns:
        The filtered string; empty if every token was a stop word.
    """
    stop_words = set(stopwords.words('english'))
    # Single filtering pass; the token list is only walked once.
    kept = [token for token in word_tokenize(text) if token not in stop_words]
    return " ".join(kept)


def remove_punctuations(text):
    """Replace each listed punctuation character in ``text`` with a space.

    Characters are replaced one-for-one, so the result always has the same
    length as the input. Apostrophes, forward slashes and backticks are not
    in the list and are therefore left untouched.

    Args:
        text: The string to process.

    Returns:
        A copy of ``text`` with every listed punctuation character replaced
        by a single space.
    """
    # The backslash is escaped explicitly: a bare backslash followed by "]"
    # is an invalid escape sequence and triggers a warning on current Pythons.
    punctuations = "!\"#$%&()*+-.,:;<=>?@[\\]^_{|}~"
    # One translate() call instead of growing a string character by character.
    to_space = str.maketrans(punctuations, " " * len(punctuations))
    return text.translate(to_space)


def text_cleaning(content):
    """Normalize one raw message for n-gram counting.

    Steps, in order: convert to ``str`` and lower-case, delete every run of
    digits, replace punctuation with spaces (``remove_punctuations``), drop
    English stop words (``remove_stopwords``), and strip leading/trailing
    whitespace. The stemming and lemmatization helpers defined in this file
    are not applied here.

    Args:
        content: The raw message. Any object is accepted; ``None`` and float
            NaN are treated as "no message".

    Returns:
        The cleaned message, or ``""`` for a missing value.
    """
    # pandas represents missing CSV cells as float NaN; str() would turn
    # None/NaN into the words "none"/"nan", which would then be counted as
    # real tokens. ``content != content`` is only true for NaN.
    if content is None or (isinstance(content, float) and content != content):
        return ""

    content = str(content).lower()
    content = re.sub(r'\d+', '', content)
    content = remove_punctuations(content)
    content = remove_stopwords(content)
    return content.strip()

def messageCleaning(message):
    """Apply ``text_cleaning`` to every item of ``message``.

    Args:
        message: An iterable of raw messages.

    Returns:
        A new list with one cleaned string per input item, in input order.
    """
    return [text_cleaning(raw) for raw in message]

## NGram Collection

In [22]:
def getNgram(msg, n):
    """Return the n-grams of a token sequence as space-joined strings.

    Args:
        msg: A sequence of string tokens.
        n: The n-gram size.

    Returns:
        A list with one string per n-gram produced by ``nltk.ngrams``, each
        built by joining that n-gram's tokens with single spaces.
    """
    return [" ".join(gram) for gram in ngrams(msg, n)]

def getNgramArray(message, n):
    """Collect the n-grams of every message into one flat list.

    Each message is tokenized with NLTK's ``word_tokenize`` and its n-grams
    (see ``getNgram``) are appended to the result, so n-grams never span two
    messages.

    Args:
        message: An iterable of message strings.
        n: The n-gram size.

    Returns:
        A flat list of space-joined n-gram strings, in message order.
    """
    collected = []
    for text in message:
        tokens = word_tokenize(text)
        collected.extend(getNgram(tokens, n))
    return collected

## Saving Data

In [23]:
def saveData(msgFreqDist, name):
    """Write an n-gram frequency mapping to CSV and return it as a DataFrame.

    Args:
        msgFreqDist: A dict-like mapping of n-gram string to count (for
            example an ``nltk.FreqDist``).
        name: Path of the CSV file to write.

    Returns:
        A DataFrame indexed by n-gram (index name ``'Word'``) with a single
        ``'Frequency'`` column. An empty mapping yields an empty frame with
        the same column, and a header-only CSV.
    """
    # Naming the column at construction (rather than assigning ``.columns``
    # afterwards) also works for an empty mapping, where the frame would
    # otherwise have zero columns and the assignment would raise ValueError.
    df_fdist = pd.DataFrame.from_dict(
        msgFreqDist, orient='index', columns=['Frequency']
    )
    df_fdist.index.name = 'Word'
    df_fdist.to_csv(name)
    return df_fdist

## NGram Frequency Distribution

In [24]:
def getGramCollection(n):
    """Gather the n-grams of all four loaded message sets into one list.

    Reads the module-level ``msg1``, ``msg2``, ``msg3`` and ``msg4``, which
    must be defined before this function is called.

    Args:
        n: The n-gram size.

    Returns:
        A flat list of n-gram strings: those of ``msg1`` first, then
        ``msg2``, ``msg3`` and ``msg4``.
    """
    collection = []
    for messages in (msg1, msg2, msg3, msg4):
        collection.extend(getNgramArray(messages, n))
    return collection

def saveFregDist(n, name):
    """Count n-grams across all message sets, save the counts and print them.

    Args:
        n: The n-gram size.
        name: Path of the CSV file the frequency table is written to.

    Returns:
        The frequency table produced by ``saveData``. Callers assign the
        result (``df1 = saveFregDist(...)``), so it must be returned rather
        than only printed.
    """
    freq_dist = nltk.FreqDist(getGramCollection(n))
    freq_table = saveData(freq_dist, name)
    print(freq_table)
    return freq_table

## Main Code

In [27]:
# Load the 'msg' column of each monthly CSV export and clean it right away,
# so msg1..msg4 only ever hold cleaned messages.
msg1 = messageCleaning(pd.read_csv('Data/May2020-NotesMsg.csv')['msg'].values)
msg2 = messageCleaning(pd.read_csv('Data/June2020-NotesMsg.csv')['msg'].values)
msg3 = messageCleaning(pd.read_csv('Data/July2020-NotesMsg.csv')['msg'].values)
msg4 = messageCleaning(pd.read_csv('Data/August2020-NotesMsg.csv')['msg'].values)

### 1-gram distribution

In [28]:
# 1-gram pass: count, write to CSV and print the table.
n, name = 1, 'singleWordFreq.csv'
df1 = saveFregDist(n, name)

                            Frequency
Word                                 
जुलाई                            1150
से                               6772
सभी                              3094
स्कूल                            1525
खुलेंगे                             1
...                               ...
com/z/ittoe                         1
com/z/jswzp/efbadeadcafbfc          1
com/z/jswzp                         1
com/z/julc/efbadeadcafbfc           1
com/z/julc                          1

[284267 rows x 1 columns]


### 2-gram distribution

In [29]:
# 2-gram pass: count, write to CSV and print the table.
n, name = 2, 'twoWordFreq.csv'
df2 = saveFregDist(n, name)

                                    Frequency
Word                                         
जुलाई से                                   87
से सभी                                     48
सभी स्कूल                                  18
स्कूल खुलेंगे                               1
answer following                          431
...                                       ...
theuolo com/z/ittoe                         1
theuolo com/z/jswzp/efbadeadcafbfc          1
theuolo com/z/jswzp                         1
theuolo com/z/julc/efbadeadcafbfc           1
theuolo com/z/julc                          1

[963711 rows x 1 columns]


### 3-gram distribution

In [30]:
# 3-gram pass: count, write to CSV and print the table.
n, name = 3, 'threeWordFreq.csv'
df3 = saveFregDist(n, name)

                                            Frequency
Word                                                 
जुलाई से सभी                                        1
से सभी स्कूल                                        1
सभी स्कूल खुलेंगे                                   1
hello thaslima welcome                              1
thaslima welcome uolo                               2
...                                               ...
service theuolo com/z/ittoe                         1
service theuolo com/z/jswzp/efbadeadcafbfc          1
service theuolo com/z/jswzp                         1
service theuolo com/z/julc/efbadeadcafbfc           1
service theuolo com/z/julc                          1

[1580652 rows x 1 columns]


### 4-gram distribution

In [31]:
# 4-gram pass: count, write to CSV and print the table.
n, name = 4, 'fourWordFreq.csv'
df4 = saveFregDist(n, name)

                                                 Frequency
Word                                                      
जुलाई से सभी स्कूल                                       1
से सभी स्कूल खुलेंगे                                     1
hello thaslima welcome uolo                              1
thaslima welcome uolo congratulations                    2
welcome uolo congratulations winning                 22525
...                                                    ...
//vc service theuolo com/z/ittoe                         1
//vc service theuolo com/z/jswzp/efbadeadcafbfc          1
//vc service theuolo com/z/jswzp                         1
//vc service theuolo com/z/julc/efbadeadcafbfc           1
//vc service theuolo com/z/julc                          1

[1851726 rows x 1 columns]


### 5-gram distribution

In [32]:
# 5-gram pass: count, write to CSV and print the table.
n, name = 5, 'fiveWordFreq.csv'
df5 = saveFregDist(n, name)

                                                    Frequency
Word                                                         
जुलाई से सभी स्कूल खुलेंगे                                  1
hello thaslima welcome uolo congratulations                 1
thaslima welcome uolo congratulations winning               2
welcome uolo congratulations winning gift               22525
uolo congratulations winning gift voucher               22525
...                                                       ...
https //vc service theuolo com/z/ittoe                      1
https //vc service theuolo com/z/jswzp/efbadead...          1
https //vc service theuolo com/z/jswzp                      1
https //vc service theuolo com/z/julc/efbadeadc...          1
https //vc service theuolo com/z/julc                       1

[1957487 rows x 1 columns]


### 6-gram distribution

In [33]:
# 6-gram pass: count, write to CSV and print the table.
n, name = 6, 'sixWordFreq.csv'
df6 = saveFregDist(n, name)

                                                    Frequency
Word                                                         
hello thaslima welcome uolo congratulations win...          1
thaslima welcome uolo congratulations winning gift          2
welcome uolo congratulations winning gift voucher       22525
uolo congratulations winning gift voucher celeb...      22525
congratulations winning gift voucher celebratin...      22525
...                                                       ...
link https //vc service theuolo com/z/jswzp/efb...          1
aug https //vc service theuolo com/z/jswzp                  1
link https //vc service theuolo com/z/julc/efba...          1
aug https //vc service theuolo com/z/julc                   1
st mid term marks feedback homework                         1

[2025310 rows x 1 columns]


### 7-gram distribution

In [34]:
# 7-gram pass: count, write to CSV and print the table.
n, name = 7, 'sevenWordFreq.csv'
df7 = saveFregDist(n, name)

                                                    Frequency
Word                                                         
hello thaslima welcome uolo congratulations win...          1
thaslima welcome uolo congratulations winning g...          2
welcome uolo congratulations winning gift vouch...      22525
uolo congratulations winning gift voucher celeb...      22525
congratulations winning gift voucher celebratin...      22525
...                                                       ...
clicking link https //vc service theuolo com/z/...          1
fri aug https //vc service theuolo com/z/jswzp              1
clicking link https //vc service theuolo com/z/...          1
fri aug https //vc service theuolo com/z/julc               1
computer st mid term marks feedback homework                1

[2013075 rows x 1 columns]
