## Task 1 :
Utilize a count vectorizer to identify the 15 most frequently occurring words in the text column. You may incorporate text preprocessing techniques.



In [11]:
#Import the necessary libraries
import nltk
import pandas as pd

In [12]:
#read the dataset
tweet=pd.read_csv("twcs.csv")

In [13]:
#show the first 5 rows of the dataset
tweet.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [14]:
#Load the text column from the twitter dataset
text=tweet['text']

In [5]:
#Load the count vectorizer from sklearn library
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
#Create an instance of the CountVectorizer class from sklearn
#We remove the stop words usign all the predefined stop words in english as they are of less significance
#We convert all the alphabets to lower case as all the words irrespective of their case should be treated equally.
#We are specifying maximum number of features to be 15 to consider the top 15 most imp words
#Below we have set the configuration of the Count_vectorizer to the terms defined above
cv=CountVectorizer(stop_words='english', lowercase=True, max_features=15)
fit_transform= cv.fit_transform(text)

In [7]:
#Count vectorizer does 3 things: Tokenization, Counting and Vectorization
#Tokenization: It breaks down words in a sentence into tokens/unique words
#Counting: It counts the frequency of each word in the text/sentence
#Vectorization: It represents the text data in a matrix form where
#rows represents the occurance of words and columns represents the words in the documents/sentences

features = cv.get_feature_names_out()
features

array(['amazonhelp', 'dm', 'help', 'hi', 'https', 'just', 'know', 'let',
       'like', 'll', 'look', 'send', 'service', 'sorry', 'thanks'],
      dtype=object)

In [8]:
#We are counting the word frequency along the columns and converting the matrix to a 1d array
word_counts = fit_transform.sum(axis=0).A1

In [9]:
#Create a dataframe with the frequency and words
df= pd.DataFrame({'Word': features, 'Count': word_counts})
df.head()

Unnamed: 0,Word,Count
0,amazonhelp,137434
1,dm,345837
2,help,274243
3,hi,226026
4,https,654412


In [10]:
#Sort all the words in descendign order of frequency
result_df = df.sort_values(by='Count', ascending=False)

In [11]:
result1 = result_df.head(15)

In [12]:
#display the top 15 words
print(result1)

          Word   Count
4        https  654412
1           dm  345837
2         help  274243
3           hi  226026
14      thanks  211116
13       sorry  193119
9           ll  168289
5         just  149194
8         like  146766
6         know  146007
7          let  141743
10        look  139986
11        send  139250
0   amazonhelp  137434
12     service  135554


### Task 2:
Top 50 Most Frequent Words (TF-IDF Model): Construct a TF-IDF model to determine the top 50 most frequent words. Compare these results with those obtained using the count vectorizer.

In [13]:
#We repeat the steps followed above using TD IDF model
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
tdidf=TfidfVectorizer(stop_words='english',lowercase=True,max_features=50)
fit_transform=tdidf.fit_transform(text)

In [15]:
features=tdidf.get_feature_names_out()

In [16]:
tfidfs=fit_transform.sum(axis=0).A1

In [17]:
result = pd.DataFrame({'Word': features, 'Score': tfidfs})

# Sort the DataFrame by TF-IDF score in descending order
result= result.sort_values(by='Score', ascending=False)

# Get the top 50 most frequent words
result2 = result.head(50)

print(result2)

            Word          Score
23         https  320024.873735
12            dm  143576.869436
20          help  133863.846550
43        thanks  127535.062382
22            hi  126032.605542
39         sorry  102089.680954
2     amazonhelp  101153.193129
25          just   92558.989824
29            ll   86880.718618
28          like   82309.202971
38       service   80722.310407
26          know   76504.320351
42         thank   73550.672776
27           let   72812.192917
5   applesupport   71933.710684
44          time   70224.870648
30          look   69437.286217
37          send   69031.024308
0        account   66994.891533
47            ve   66738.004886
36         phone   63460.902010
34        number   62451.057182
32          need   62032.567559
24         issue   60242.478124
41          team   58305.323627
13           don   57987.743477
21           hey   57201.461507
7          check   54521.819914
3            amp   53054.768247
35         order   52629.004086
14      

In [27]:
result2

Unnamed: 0,Word,Score
23,https,320024.873735
12,dm,143576.869436
20,help,133863.84655
43,thanks,127535.062382
22,hi,126032.605542
39,sorry,102089.680954
2,amazonhelp,101153.193129
25,just,92558.989824
29,ll,86880.718618
28,like,82309.202971


In [35]:
#As we can see here the resultant datafram containing both the pevious dataframes obtained from the Count Vectroizer and TD-IDF are compared
#We conclude that the results have slight variations when using TD-IDF since the model takes the term freq and how unique the word is across each document unlike countvectorizer which only looks at term frequencies across the all documents
#TD-IDF finds content specific words which are important in each document, hence depending on a particular task we can decide if TD-IDF or Count vextorizer is better

result1.reset_index(drop=True, inplace=True)
result2.reset_index(drop=True, inplace=True)

# Concatenate DataFrames along columns (axis=1)
final= pd.concat([result1, result2], axis=1)
final.head(10)

Unnamed: 0,Word,Count,Word.1,Score
0,https,654412.0,https,320024.873735
1,dm,345837.0,dm,143576.869436
2,help,274243.0,help,133863.84655
3,hi,226026.0,thanks,127535.062382
4,thanks,211116.0,hi,126032.605542
5,sorry,193119.0,sorry,102089.680954
6,ll,168289.0,amazonhelp,101153.193129
7,just,149194.0,just,92558.989824
8,like,146766.0,ll,86880.718618
9,know,146007.0,like,82309.202971
