

## Task 3 and 4:


Word Embedding Models: Train and create word embedding models, including Word2Vec, GloVe, and FastText, on the same dataset.

Comparison of Similar Words: Compare words similar to the top 15 most frequent words identified in step 1 using the word embedding models.


In [30]:
import pandas as pd
import re
import string

#Preprocess the text data in the Twitter file before building the embedding models
#Convert all letters to lowercase so that words are treated the same regardless of case (for example, if they contain an upper-case letter)
#Remove all @ mentions, # hashtag symbols and URLs
#Remove special characters and punctuation, then split the text on whitespace into words

# Patterns and tables are built once at import time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F004-\U0001F0CF"  # mahjong / playing-card symbols
    "\U0001F170-\U0001F251"  # enclosed alphanumerics / ideographs
    "]+",
    flags=re.UNICODE,
)
_MENTION_PATTERN = re.compile(r'@\w+')
# `www` must start a token and be followed by a dot, so that words such as
# "awwww" are no longer mistaken for a URL and truncated.
_URL_PATTERN = re.compile(r'http\S+|\bwww\.\S+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_tweet(tweet):
    """Return a cleaned, lowercase, whitespace-normalised version of *tweet*.

    Steps, in order: lowercase; drop emoji; drop ``@mentions``; drop the ``#``
    symbol (the hashtag word itself is kept); drop URLs; drop ASCII
    punctuation; collapse runs of whitespace to a single space.

    Args:
        tweet: Raw tweet text. Non-string values (for example the float NaN
            that pandas uses for missing text) are treated as empty.

    Returns:
        The cleaned text as a single space-separated string; ``''`` when the
        input is not a string or nothing is left after cleaning.
    """
    if not isinstance(tweet, str):
        return ''

    tweet = tweet.lower()
    tweet = _EMOJI_PATTERN.sub('', tweet)
    tweet = _MENTION_PATTERN.sub('', tweet)
    tweet = tweet.replace('#', '')
    # URLs are removed before punctuation so that '://' and '.' are still present to match on.
    tweet = _URL_PATTERN.sub('', tweet)
    tweet = tweet.translate(_PUNCTUATION_TABLE)
    return ' '.join(tweet.split())

# Load the tweet CSV and replace the raw 'text' column with its cleaned version.
data = pd.read_csv('twcs.csv').assign(
    text=lambda frame: frame['text'].apply(preprocess_tweet)
)


In [31]:
# Preview the first rows to check that the 'text' column now holds the cleaned tweets.
data.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,i understand i would like to assist you we wou...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,i have sent several private messages and no on...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,please send us a private message so that we ca...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,i did,4.0,6.0


In [32]:
# Tokenise each cleaned tweet on whitespace; entries that are not strings are skipped.
sentences = [text.split() for text in data['text'] if isinstance(text, str)]

In [33]:
# Show the tokenised tweets (one list of word tokens per tweet).
sentences

[['i',
  'understand',
  'i',
  'would',
  'like',
  'to',
  'assist',
  'you',
  'we',
  'would',
  'need',
  'to',
  'get',
  'you',
  'into',
  'a',
  'private',
  'secured',
  'link',
  'to',
  'further',
  'assist'],
 ['and', 'how', 'do', 'you', 'propose', 'we', 'do', 'that'],
 ['i',
  'have',
  'sent',
  'several',
  'private',
  'messages',
  'and',
  'no',
  'one',
  'is',
  'responding',
  'as',
  'usual'],
 ['please',
  'send',
  'us',
  'a',
  'private',
  'message',
  'so',
  'that',
  'we',
  'can',
  'further',
  'assist',
  'you',
  'just',
  'click',
  '‘message’',
  'at',
  'the',
  'top',
  'of',
  'your',
  'profile'],
 ['i', 'did'],
 ['can',
  'you',
  'please',
  'send',
  'us',
  'a',
  'private',
  'message',
  'so',
  'that',
  'i',
  'can',
  'gain',
  'further',
  'details',
  'about',
  'your',
  'account'],
 ['is', 'the', 'worst', 'customer', 'service'],
 ['this',
  'is',
  'saddening',
  'to',
  'hear',
  'please',
  'shoot',
  'us',
  'a',
  'dm',
  'so',


In [34]:
# Keep a handle on the cleaned tweet text column; it is the input to the CountVectorizer cell.
text=data['text']

In [35]:
# Identify the 15 most frequent non-stop-word terms in the cleaned tweets
# (per the original note, this step was also carried out earlier in the notebook).
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=15, stop_words='english', lowercase=True)
fit_transform = cv.fit_transform(text)  # term counts per tweet, restricted to the 15 retained terms
features = cv.get_feature_names_out()
top_words = [*features]
top_words

['account',
 'dm',
 'email',
 'help',
 'hi',
 'im',
 'just',
 'know',
 'like',
 'look',
 'number',
 'send',
 'service',
 'sorry',
 'thanks']

In [38]:
import gensim.downloader as api

# Download and load the pre-trained "glove-wiki-gigaword-100" GloVe model.
# NOTE(review): these vectors are pre-trained, whereas the Word2Vec and FastText models in
# this notebook are trained on the tweet sentences — confirm this matches the task statement.
glove_model = api.load("glove-wiki-gigaword-100")



In [40]:
# Persist the loaded GloVe vectors to disk.
# NOTE(review): presumably this writes gensim's own save format despite the ".bin" name — confirm before loading it with other tools.
glove_model.save("glove_model.bin")

In [15]:
# Train Word2Vec and FastText on the tokenised tweets and save both models to the
# working directory (Google Colab in the original run).

from gensim.models import Word2Vec, KeyedVectors, FastText

# CBOW (sg=0), 100-dimensional vectors, context window of 5, every token kept (min_count=1).
word2vec_model = Word2Vec(sentences=sentences, sg=0, min_count=1, window=5, vector_size=100)
word2vec_model.save("word2vec.model")

# FastText is trained with the same hyperparameters.
fasttext_model = FastText(sentences=sentences, sg=0, min_count=1, window=5, vector_size=100)
fasttext_model.save("fasttext.model")

In [16]:
# Reload the Word2Vec and FastText models from the files they were saved to.
word2vec_model = Word2Vec.load("word2vec.model")
fasttext_model = FastText.load("fasttext.model")


In [44]:
# For each of the top-15 CountVectorizer words, collect its nearest neighbours from
# every embedding model so the models can be compared side by side.
similar_words = {}
for word in top_words:
    similar_words[word] = {
        "Word2Vec": word2vec_model.wv.most_similar(word),
        "FastText": fasttext_model.wv.most_similar(word),
        # Query the loop variable, not the literal string 'word' (which returned the
        # same neighbours for every entry). The pre-trained GloVe vocabulary may lack
        # a tweet-specific term, so fall back to an empty list instead of raising KeyError.
        "Glove_model": (
            glove_model.most_similar(word, topn=5) if word in glove_model else []
        ),
    }

In [47]:
# Print the Word2Vec and FastText neighbours of every top word.
# The GloVe neighbours (stored under "Glove_model") are not printed by this cell.
for word in similar_words:
    neighbours = similar_words[word]
    print(f"Top similar words for '{word}':")
    for model_name in ("Word2Vec", "FastText"):
        print(f"{model_name}:", neighbours[model_name])
    print("\n")

#From the output below we see that the words most similar to the frequent words relate to accounts, email, numbers, help, service, assistance, etc.

Top similar words for 'account':
Word2Vec: [('booking', 0.8616283535957336), ('phone', 0.8609009385108948), ('order', 0.8507465124130249), ('name', 0.8491856455802917), ('email', 0.8430359959602356), ('address', 0.838095486164093), ('card', 0.8355706334114075), ('full', 0.83241868019104), ('acct', 0.8318496346473694), ('number', 0.8297367691993713)]
FastText: [('“account', 0.9992746710777283), ('accounti', 0.9955956339836121), ('myaccount', 0.9955276250839233), ('accountsq', 0.9945540428161621), ('accountshubz', 0.9944950342178345), ('accountit', 0.9910048246383667), ('accountsean', 0.9908014535903931), ('accounts', 0.9885589480400085), ('account’s', 0.9878804683685303), ('accountsee', 0.9875584840774536)]


Top similar words for 'dm':
Word2Vec: [('send', 0.8738312721252441), ('gamertag', 0.8673363327980042), ('postcode', 0.8499742746353149), ('message', 0.8447126746177673), ('contact', 0.8258100152015686), ('shoot', 0.8236198425292969), ('lexie', 0.8214563131332397), ('✉', 0.817937970

## Task 5:
Analysis from Three Perspectives: Analyze the output from three perspectives:
a) Context: Examine the context in which the words appear in the Twitter data.
b) Product: Identify any product or service-specific information present in the dataset.
c) Location: Determine whether there are geographic-specific details or references within the Twitter data.


In [19]:
# Examine the context in which each top word appears in the dataset.
# Whole-word matching is used so that short words such as 'hi' or 'im' do not match
# inside longer words (e.g. "this", "time"). Missing text is treated as a non-match.
# The per-word result has its own name so the tokenised `sentences` list that the
# embedding models were trained on is not overwritten.
context_sentences = []
for word in top_words:
    word_pattern = rf'\b{re.escape(word)}\b'
    matching_tweets = data.loc[
        data['text'].str.contains(word_pattern, case=False, regex=True, na=False),
        'text',
    ].tolist()
    context_sentences.extend(matching_tweets)

In [20]:

context_sentences
#From the results below, the context is mostly customer support for issues and concerns regarding various topics such as money, accounts, etc.
#Typical themes: account assistance, billing and payment issues, technical support, security concerns, etc.

['can you please send us a private message so that i can gain further details about your account',
 'please send me a private message so that i can send you the link to access your account fr',
 'whenever i contact customer support they tell me i have shortcode enabled on my account but i have never in the 4 years ive tried',
 'the correct way to do it is via an ocs account takeover and email consent form it does not need to be done in a local office',
 'the information pertaining to the account assumption is correct this does need to be done at a local outlet wit',
 'what else can i provide they refuse to help me because they cannot validate the account',
 'we would not be able to verify anything without authenticating your account jay',
 'we can use the order number to locate the account but will need to do so in our secure one on one chat please follow and dm us hsb',
 'nobody can find my account or number i walked out of a store with this ive explained that they can find my acct vi

In [25]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Extract every entity that spaCy labels PRODUCT (company names would be ORG and
# are not collected here).
# nlp.pipe processes the tweets as a batched stream, which is the recommended way to
# run a pipeline over many texts. Unlike a `for text in ...` loop, it also leaves
# the `text` Series defined earlier in the notebook untouched.
product_names = []
for doc in nlp.pipe(data['text']):
    product_names.extend(ent.text for ent in doc.ents if ent.label_ == 'PRODUCT')

In [26]:
#From this output, the products mentioned most often are video-game related, chiefly Xbox / Xbox One.
#There are also mentions of Scorpio and the Nintendo Switch.
#The list also contains noise (e.g. '737', 'quesadilla'), so the PRODUCT labels should be read with care.
product_names

['xbox one',
 'xbox',
 'xbox one',
 'xbox one',
 'scorpio',
 'tmt',
 'xbox',
 'xbox',
 'xbox',
 'xbox',
 'xbox',
 'xbox',
 'xbox one',
 'xbox',
 'tengo volando con',
 '737',
 'xbox one',
 'xbox one',
 'ep17',
 'xbox',
 'xbox one',
 'xbox one',
 'xbox',
 'xbox',
 'ballin shawn',
 'am60',
 'xbox',
 'max',
 'xbox one',
 '？ nintendo switchを購入したいんですが',
 '404s',
 'pero',
 'xbox',
 'xbox',
 'quesadilla',
 'xbox',
 'xbox one',
 'xbox',
 'xbox',
 'xbox one',
 'ilfh6',
 '⚾',
 'chasus33',
 'scorpio',
 'xbox',
 'xbox',
 '630p',
 'हैं',
 '？',
 '現在、そのandroidでメールが一切受信できなくなった、ということでよろしいしょうか',
 'mw2r',
 '？',
 'xbox one',
 'me25',
 '240p',
 'xbox',
 'sg22',
 'svuseason18',
 'kaye',
 'kaye',
 'am12 pm',
 'rs19',
 '232',
 '340',
 '747',
 'gqg43e',
 'thro',
 'rc18',
 'the2',
 'to23',
 '747s',
 '747',
 'das so',
 'das problem',
 'us12au',
 'बुकिंग',
 '200s',
 'diese',
 'ba206',
 'xbox',
 'xbox',
 'xbox',
 'xbox',
 'xbox one',
 'xbox one',
 'xbox',
 'xbox',
 '757',
 'ak36',
 '、、、',
 '767',
 'alexandria',
 'y

In [27]:
# Collect every entity that spaCy labels GPE (geopolitical entity).
# nlp.pipe processes the tweets as a batched stream, which is the recommended way to
# run a pipeline over many texts. Unlike a `for text in ...` loop, it also leaves
# the `text` Series defined earlier in the notebook untouched.
location_names = []
for doc in nlp.pipe(data['text']):
    location_names.extend(ent.text for ent in doc.ents if ent.label_ == 'GPE')

In [28]:
# Inspect the extracted GPE entities. The list includes non-place tokens such as
# 'n’t' and 'burrito', so it should be treated as noisy.
location_names

['mississippi',
 'the retina imac',
 'tanuj',
 'airasia',
 'airasia',
 'narnia amazon',
 'anna',
 'germany',
 'china',
 'n’t',
 'lexington',
 'aeromexico',
 'n’t',
 'delhi',
 'amsterdam',
 'london',
 'london',
 'london',
 'sacramento',
 'burrito',
 '✨',
 'burrito',
 'n’t',
 'burrito',
 'burrito',
 'quochuy046',
 'arizona',
 'us',
 'georgia',
 'apps fuccin',
 'germany',
 'hurricane irma',
 'san francisco',
 'london',
 'seattle',
 'singh please',
 'hai',
 'singapore',
 'noooooooooo',
 'rechaza',
 'tanuj',
 'tanuj',
 'tanuj',
 'us',
 'us',
 'london',
 'america',
 '’s',
 'pittsburgh',
 'd7',
 'canada',
 'france',
 'australia',
 'n’t',
 'lima',
 'amsterdam',
 'paris',
 'birmingham',
 'london',
 'n’t',
 'n’t',
 'n’t',
 'n’t',
 'afc',
 'bronx',
 'us',
 'london',
 'uk',
 'uk',
 'uk',
 'alaska',
 'c’m',
 'miami',
 'charlotte',
 'int’l',
 'n’t',
 'möchte',
 'los detalles',
 'germany',
 'geneva',
 'mexico city',
 'washington',
 'france',
 'vermont',
 'central jersey',
 'kuwait',
 'kuwait',
 'kuwa

From the above two tasks, we can conclude that the dataset is about tweets regarding customer service. Customers mostly ask for help with their accounts, money and refunds, and games such as Xbox; the tweets mention various locations around the world, mainly US states and Europe.