In [1]:
import pandas as pd

import re

import nltk

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams

## Text Cleaning

In [35]:
def porter_stemming(text):
    """Return ``text`` with every token replaced by its Porter stem.

    Tokens are obtained by splitting on a single space character (so runs of
    spaces produce empty tokens, which are passed to the stemmer like any
    other), and the stemmed tokens are joined back with single spaces.
    """
    stem = PorterStemmer().stem
    return " ".join(map(stem, text.split(" ")))


def wordnet_lemmatization(text):
    """Return ``text`` with every token lemmatized by NLTK's WordNetLemmatizer.

    The input is split on a single space character, each piece is passed to
    ``WordNetLemmatizer.lemmatize`` with its default arguments, and the
    results are re-joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    pieces = text.split(" ")
    return " ".join(lemmatizer.lemmatize(piece) for piece in pieces)


def remove_stopwords(text):
    """Return ``text`` without English stopwords.

    The text is tokenized with ``nltk.word_tokenize``; every token found in
    NLTK's English stopword list is dropped and the remaining tokens are
    joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The filtered text as a single space-separated string.
    """
    stop_words = set(stopwords.words('english'))
    # Single pass over the tokens (the filtered list used to be built twice,
    # with the first result thrown away).
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return " ".join(kept_tokens)


# Characters that remove_punctuations turns into a space. The backslash is written
# as "\\" so the literal contains no invalid escape sequence; the resulting set of
# characters is unchanged. The apostrophe, "/" and "`" are not part of the set and
# therefore survive cleaning.
_PUNCTUATION_CHARS = "!\"#$%&()*+-.,:;<=>?@[\\]^_{|}~"
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(_PUNCTUATION_CHARS, " "))


def remove_punctuations(text):
    """Replace every punctuation character in ``text`` with a single space.

    Each character listed in ``_PUNCTUATION_CHARS`` is mapped to one space, so
    the result always has the same length as the input; all other characters
    are copied through unchanged.

    Args:
        text: The string to process.

    Returns:
        A new string with punctuation characters replaced by spaces.
    """
    # One translate() call instead of growing a string character by character.
    return text.translate(_PUNCTUATION_TABLE)


def text_cleaning(content):
    """Normalize one raw message before n-gram counting.

    Steps, in order: convert to ``str`` and lowercase, delete every run of
    digits, replace punctuation with spaces (``remove_punctuations``) and
    strip leading/trailing whitespace.

    Args:
        content: The raw message. Non-string values are converted with
            ``str()``; missing values (``None`` or a float NaN) are treated
            as an empty message.

    Returns:
        The cleaned message as a string ("" for missing values).
    """
    # A missing value must not be stringified: str(nan) / str(None) would add
    # the bogus words "nan" / "none" to the frequency tables.
    # `content != content` is the dependency-free NaN test.
    if content is None or (isinstance(content, float) and content != content):
        return ""

    content = str(content).lower()
    content = re.sub(r'\d+', '', content)
    content = remove_punctuations(content)
    # Optional normalization steps, currently disabled. Enable them here and
    # reload the data to apply them consistently to every n-gram table.
    #     content = remove_stopwords(content)
    #     content = porter_stemming(content)
    #     content = wordnet_lemmatization(content)
    return content.strip()

def messageCleaning(message):
    """Apply ``text_cleaning`` to every item of ``message``.

    ``message`` is any iterable of raw messages; the result is a new list of
    cleaned strings in the same order.
    """
    return [text_cleaning(raw) for raw in message]

## Saving Data

In [37]:
def saveData(msgFreqDist, name):
    """Write a frequency mapping to CSV and return it as a DataFrame.

    Args:
        msgFreqDist: Mapping of item -> count (e.g. an ``nltk.FreqDist``).
        name: Path of the CSV file to write.

    Returns:
        A DataFrame indexed by the mapping's keys (index name ``'Word'``)
        with a single ``'Frequency'`` column, in the mapping's iteration order.
    """
    # Naming the column at construction time also works for an empty mapping;
    # assigning `.columns` afterwards raises a length-mismatch ValueError then.
    df_fdist = pd.DataFrame.from_dict(msgFreqDist, orient='index', columns=['Frequency'])
    df_fdist.index.name = 'Word'
    df_fdist.to_csv(name)
    return df_fdist

## NGram Collection

In [47]:
def getNgram(msg, n):
    """Return the n-grams of ``msg`` as space-joined strings.

    ``msg`` is a sequence of string tokens; ``nltk.ngrams`` yields each window
    of ``n`` consecutive tokens, and every window is joined with single
    spaces. The result is a list in the order the windows are produced.
    """
    return [" ".join(window) for window in ngrams(msg, n)]

def getNgramArray(message, n):
    """Collect the n-grams of every text in ``message`` into one flat list.

    Each text is tokenized with ``nltk.word_tokenize`` and handed to
    ``getNgram``. No cleaning is applied here, so callers pass texts in the
    form they want counted. N-grams never span two texts.
    """
    collected = []
    for text in message:
        collected.extend(getNgram(word_tokenize(text), n))
    return collected


def getGramCollection(n, messages=None):
    """Return all n-grams from several message collections as one flat list.

    Args:
        n: Size of the n-grams to extract.
        messages: Optional iterable of message collections (each an iterable
            of strings). When omitted, the notebook globals ``msg1``,
            ``msg2``, ``msg3`` and ``msg4`` are used, in that order.

    Returns:
        A list of space-joined n-gram strings, collection by collection.
    """
    if messages is None:
        # Globals are resolved at call time, so the data-loading cell must
        # have been executed before this function is called without `messages`.
        messages = (msg1, msg2, msg3, msg4)

    gram_collection = []
    for batch in messages:
        gram_collection.extend(getNgramArray(batch, n))
    return gram_collection

def saveFregDist(n, name):
    """Count the n-grams of the loaded messages, save the counts and show them.

    Builds an ``nltk.FreqDist`` over ``getGramCollection(n)``, passes it to
    ``saveData`` (which writes the CSV file ``name``), prints the resulting
    DataFrame and returns it.
    """
    frequencies = nltk.FreqDist(getGramCollection(n))
    table = saveData(frequencies, name)
    print(table)
    return table

## Main Code

In [39]:
# Load the 'msg' column of each monthly CSV export and clean it in one step.
# The names msg1..msg4 are read as globals by getGramCollection.
msg1 = messageCleaning(pd.read_csv('Data/May2020-NotesMsg.csv')['msg'].values)
msg2 = messageCleaning(pd.read_csv('Data/June2020-NotesMsg.csv')['msg'].values)
msg3 = messageCleaning(pd.read_csv('Data/July2020-NotesMsg.csv')['msg'].values)
msg4 = messageCleaning(pd.read_csv('Data/August2020-NotesMsg.csv')['msg'].values)

### 1-gram distribution

In [28]:
# Unigram (n = 1) frequencies: saveFregDist counts them, writes the table to
# singleWordFreq.csv (relative to the working directory) and prints it.
# NOTE(review): this cell's execution count (28) is lower than that of the
# data-loading cell (39), so the saved output below may come from an earlier
# cleaning configuration -- confirm with Restart & Run All.
n = 1
name = 'singleWordFreq.csv'
df1 = saveFregDist(n, name)

                            Frequency
Word                                 
जुलाई                            1150
से                               6772
सभी                              3094
स्कूल                            1525
खुलेंगे                             1
...                               ...
com/z/ittoe                         1
com/z/jswzp/efbadeadcafbfc          1
com/z/jswzp                         1
com/z/julc/efbadeadcafbfc           1
com/z/julc                          1

[284267 rows x 1 columns]


### 2-gram distribution

In [29]:
# Bigram (n = 2) frequencies, written to twoWordFreq.csv and printed.
# NOTE(review): executed (29) before the data-loading cell's latest run (39);
# the saved output below may come from an earlier cleaning configuration --
# confirm with Restart & Run All.
n = 2
name = 'twoWordFreq.csv'
df2 = saveFregDist(n, name)

                                    Frequency
Word                                         
जुलाई से                                   87
से सभी                                     48
सभी स्कूल                                  18
स्कूल खुलेंगे                               1
answer following                          431
...                                       ...
theuolo com/z/ittoe                         1
theuolo com/z/jswzp/efbadeadcafbfc          1
theuolo com/z/jswzp                         1
theuolo com/z/julc/efbadeadcafbfc           1
theuolo com/z/julc                          1

[963711 rows x 1 columns]


### 3-gram distribution

In [30]:
# Trigram (n = 3) frequencies, written to threeWordFreq.csv and printed.
# NOTE(review): executed (30) before the data-loading cell's latest run (39);
# the saved output below may come from an earlier cleaning configuration --
# confirm with Restart & Run All.
n = 3
name = 'threeWordFreq.csv'
df3 = saveFregDist(n, name)

                                            Frequency
Word                                                 
जुलाई से सभी                                        1
से सभी स्कूल                                        1
सभी स्कूल खुलेंगे                                   1
hello thaslima welcome                              1
thaslima welcome uolo                               2
...                                               ...
service theuolo com/z/ittoe                         1
service theuolo com/z/jswzp/efbadeadcafbfc          1
service theuolo com/z/jswzp                         1
service theuolo com/z/julc/efbadeadcafbfc           1
service theuolo com/z/julc                          1

[1580652 rows x 1 columns]


### 4-gram distribution

In [31]:
# 4-gram frequencies, written to fourWordFreq.csv and printed.
# NOTE(review): the saved output below contains no stopwords (e.g. "welcome
# uolo congratulations winning") although text_cleaning has remove_stopwords
# commented out; this cell (31) ran before the latest data load (39) --
# presumably with a different cleaning configuration; re-run to confirm.
n = 4
name = 'fourWordFreq.csv'
df4 = saveFregDist(n, name)

                                                 Frequency
Word                                                      
जुलाई से सभी स्कूल                                       1
से सभी स्कूल खुलेंगे                                     1
hello thaslima welcome uolo                              1
thaslima welcome uolo congratulations                    2
welcome uolo congratulations winning                 22525
...                                                    ...
//vc service theuolo com/z/ittoe                         1
//vc service theuolo com/z/jswzp/efbadeadcafbfc          1
//vc service theuolo com/z/jswzp                         1
//vc service theuolo com/z/julc/efbadeadcafbfc           1
//vc service theuolo com/z/julc                          1

[1851726 rows x 1 columns]


### 5-gram distribution

In [40]:
# 5-gram frequencies: saveFregDist counts them, writes the table to
# fiveWordFreq.csv (relative to the working directory) and prints it.
n = 5
name = 'fiveWordFreq.csv'
df5 = saveFregDist(n, name)

                                                    Frequency
Word                                                         
जुलाई से सभी स्कूल खुलेंगे                                  1
hello thaslima welcome to uolo                              1
thaslima welcome to uolo congratulations                    2
welcome to uolo congratulations for                     22525
to uolo congratulations for winning                     22525
...                                                       ...
https //vc service theuolo com/z/ittoe                      1
https //vc service theuolo com/z/jswzp/efbadead...          1
https //vc service theuolo com/z/jswzp                      1
https //vc service theuolo com/z/julc/efbadeadc...          1
https //vc service theuolo com/z/julc                       1

[2548390 rows x 1 columns]


### 6-gram distribution

In [41]:
# 6-gram frequencies: saveFregDist counts them, writes the table to
# sixWordFreq.csv (relative to the working directory) and prints it.
n = 6
name = 'sixWordFreq.csv'
df6 = saveFregDist(n, name)

                                                    Frequency
Word                                                         
hello thaslima welcome to uolo congratulations              1
thaslima welcome to uolo congratulations for                2
welcome to uolo congratulations for winning             22525
to uolo congratulations for winning a                   22525
uolo congratulations for winning a gift                 22525
...                                                       ...
am https //vc service theuolo com/z/ittoe                   1
link https //vc service theuolo com/z/jswzp/efb...          1
am https //vc service theuolo com/z/jswzp                   1
link https //vc service theuolo com/z/julc/efba...          1
am https //vc service theuolo com/z/julc                    1

[2700730 rows x 1 columns]


### 7-gram distribution

In [42]:
# 7-gram frequencies: saveFregDist counts them, writes the table to
# sevenWordFreq.csv (relative to the working directory) and prints it.
n = 7
name = 'sevenWordFreq.csv'
df7 = saveFregDist(n, name)

                                                    Frequency
Word                                                         
hello thaslima welcome to uolo congratulations for          1
thaslima welcome to uolo congratulations for wi...          2
welcome to uolo congratulations for winning a           22525
to uolo congratulations for winning a gift              22525
uolo congratulations for winning a gift voucher         22525
...                                                       ...
aug am https //vc service theuolo com/z/ittoe               1
the link https //vc service theuolo com/z/jswzp...          1
aug am https //vc service theuolo com/z/jswzp               1
the link https //vc service theuolo com/z/julc/...          1
aug am https //vc service theuolo com/z/julc                1

[2740215 rows x 1 columns]


## Sorting Data

In [73]:
def sortDataByFileName(name, input_dir='required_data', output_dir='sorted_data', limit=1000000):
    """Sort a saved frequency table by descending frequency and save the top rows.

    Reads ``<input_dir>/<name>``, orders the rows by the ``Frequency`` column
    (highest first), renumbers them from 0, keeps only the ``Word`` and
    ``Frequency`` columns and at most ``limit`` rows, writes the result to
    ``<output_dir>/<name>`` and prints it.

    NOTE(review): ``saveData`` writes its CSV to the bare file name it is
    given, while the default ``input_dir`` here is ``required_data`` -- the
    files presumably have to be moved there first, or ``input_dir`` set to
    the directory they were written to.

    Args:
        name: File name of the CSV, used for both input and output.
        input_dir: Directory containing the unsorted CSV.
        output_dir: Directory for the sorted CSV; created if it is missing.
        limit: Maximum number of rows to keep; ``None`` keeps all rows.

    Returns:
        None. The result is written to disk and printed.
    """
    import os  # function-scope import keeps this cell self-contained

    ranked = (
        # keep_default_na=False: words such as "null", "nan" or "n/a" are real
        # tokens here and must not be parsed as missing values.
        pd.read_csv(os.path.join(input_dir, name), keep_default_na=False)
        .sort_values(by=['Frequency'], ascending=False)
        .reset_index(drop=True)[['Word', 'Frequency']]
        .iloc[:limit]
    )

    os.makedirs(output_dir, exist_ok=True)
    ranked.to_csv(os.path.join(output_dir, name))
    print(ranked)

In [74]:
# Rank the unigram table: reads required_data/singleWordFreq.csv and writes
# the frequency-sorted result to sorted_data/singleWordFreq.csv.
sortDataByFileName('singleWordFreq.csv')

                Word  Frequency
0              class     399806
1              https     399454
2             online     308624
3            theuolo     287383
4               dear     282670
...              ...        ...
284262  প্ৰদূষণমুক্ত          1
284263        ধোঁৱাক          1
284264       উদ্যোগৰ          1
284265          আগতে          1
284266    com/z/julc          1

[284267 rows x 2 columns]


In [75]:
# Rank the bigram table: reads required_data/twoWordFreq.csv and writes the
# frequency-sorted result to sorted_data/twoWordFreq.csv.
sortDataByFileName('twoWordFreq.csv')

                        Word  Frequency
0               online class     219393
1            service theuolo     200165
2               //vc service     200165
3                 https //vc     200165
4        feedback assignment     182096
...                      ...        ...
963706      username deepakm          1
963707       apithgowdas stu          1
963708  username apithgowdas          1
963709         anveethsp stu          1
963710    theuolo com/z/julc          1

[963711 rows x 2 columns]


In [76]:
# Rank the trigram table (reads required_data/, writes sorted_data/); the
# function keeps only the first 1,000,000 rows after sorting.
sortDataByFileName('threeWordFreq.csv')

                               Word  Frequency
0              //vc service theuolo     200165
1                https //vc service     200165
2               going start minutes      88115
3               clicking link https      88109
4                     also write us      87125
...                             ...        ...
999995                 big data mlm          1
999996   jayanthi assignment posted          1
999997            big data syllabus          1
999998         ready pencil collins          1
999999  test completed successfully          1

[1000000 rows x 2 columns]


In [77]:
# Rank the 4-gram table (reads required_data/, writes sorted_data/); the
# function keeps only the first 1,000,000 rows after sorting.
sortDataByFileName('fourWordFreq.csv')

                                   Word  Frequency
0            https //vc service theuolo     200165
1                us support theuolo com      87097
2                  time time update new      87097
3              ms tio receive important      87097
4          tio receive important school      87097
...                                 ...        ...
999995      anthima r wishing happiness          1
999996       dear radhika mahesh rajput          1
999997    radhika mahesh rajput wishing          1
999998  mahesh rajput wishing happiness          1
999999    sukeerthi g wishing happiness          1

[1000000 rows x 2 columns]


In [78]:
# Rank the 5-gram table (reads required_data/, writes sorted_data/); the
# function keeps only the first 1,000,000 rows after sorting.
sortDataByFileName('fiveWordFreq.csv')

                                     Word  Frequency
0                    is going to start in      89207
1                   you can also write to      87127
2                    can also write to us      87125
3                   app from time to time      87099
4                 features in the app you      87097
...                                   ...        ...
999995  minutes before with your notebook          2
999996      before with your notebook and          2
999997         with your notebook and pen          2
999998           your notebook and pen to          2
999999          notebook and pen to solve          2

[1000000 rows x 2 columns]


In [79]:
# Rank the 6-gram table (reads required_data/, writes sorted_data/); the
# function keeps only the first 1,000,000 rows after sorting.
sortDataByFileName('sixWordFreq.csv')

                                         Word  Frequency
0                    you can also write to us      87125
1                   from time to time we will      87097
2       tio you will receive important school      87097
3              write to us at support theuolo      87097
4                 also write to us at support      87097
...                                       ...        ...
999995                  a note of this and be          2
999996                   july pm cae ab ba da          2
999997                   is july pm cae ab ba          2
999998                 date is july pm cae ab          2
999999        write again thank you chapter d          2

[1000000 rows x 2 columns]


In [80]:
# Rank the 7-gram table (reads required_data/, writes sorted_data/); the
# function keeps only the first 1,000,000 rows after sorting.
sortDataByFileName('sevenWordFreq.csv')

                                                     Word  Frequency
0                    am ms tio you will receive important      87097
1                         new features in the app you can      87097
2                        to time we will update you about      87097
3                       time we will update you about new      87097
4                   we will update you about new features      87097
...                                                   ...        ...
999995   in minutes https //vc service theuolo com/z/cvts          2
999996  the link https //vc service theuolo com/zoch/d...          2
999997                    class vii a b sanskrit time aug          2
999998  the link https //vc service theuolo com/zoch/g...          2
999999  the link https //vc service theuolo com/zoch/i...          2

[1000000 rows x 2 columns]
