In [272]:
#Importing necessary libraries
import pandas as pd
import string 
import nltk
import multiprocessing

from gensim.models import Word2Vec
from time import time
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
from gensim.models import KeyedVectors

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [273]:
#Importing dataset file- csv format
df = pd.read_csv("./Desktop/BBC News Train.csv")
df

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [274]:
#Checking articles per class
df["Category"].value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [275]:
#Text Processing
from nltk import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet') 
lemmatizer = WordNetLemmatizer() 

#Function to remove punctuations and stop words, lemmatization and tokenization
def text_processing(text):
    text_p = "".join([char for char in text if char not in string.punctuation])
    words = word_tokenize(text_p)
    filtered_tokens = [word for word in words if word not in stop_words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return lemmatized_words
    
df_text = df["Text"].apply(lambda x: text_processing(x))    

[nltk_data] Downloading package punkt to C:\Users\sana
[nltk_data]     khan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\sana
[nltk_data]     khan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\sana
[nltk_data]     khan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [276]:
df_text.head()

0    [worldcom, exboss, launch, defence, lawyer, de...
1    [german, business, confidence, slide, german, ...
2    [bbc, poll, indicates, economic, gloom, citize...
3    [lifestyle, governs, mobile, choice, faster, b...
4    [enron, boss, 168m, payout, eighteen, former, ...
Name: Text, dtype: object

In [277]:
df_class = df["Category"]
df_class.head()

0    business
1    business
2    business
3        tech
4    business
Name: Category, dtype: object

In [278]:
#Concating the Text and Category Columns(dataframes) to prepare training dataframe or training set
df_train = pd.concat([df_text, df_class], axis = 1)
df_train.head()

Unnamed: 0,Text,Category
0,"[worldcom, exboss, launch, defence, lawyer, de...",business
1,"[german, business, confidence, slide, german, ...",business
2,"[bbc, poll, indicates, economic, gloom, citize...",business
3,"[lifestyle, governs, mobile, choice, faster, b...",tech
4,"[enron, boss, 168m, payout, eighteen, former, ...",business


In [279]:
cores = multiprocessing.cpu_count()
print(cores)
# Counts the number of cores in a computer

8


In [280]:
tweetText = df_train['Text']
print(tweetText)

0       [worldcom, exboss, launch, defence, lawyer, de...
1       [german, business, confidence, slide, german, ...
2       [bbc, poll, indicates, economic, gloom, citize...
3       [lifestyle, governs, mobile, choice, faster, b...
4       [enron, boss, 168m, payout, eighteen, former, ...
                              ...                        
1485    [double, eviction, big, brother, model, capric...
1486    [dj, double, act, revamp, chart, show, dj, duo...
1487    [weak, dollar, hit, reuters, revenue, medium, ...
1488    [apple, ipod, family, expands, market, apple, ...
1489    [santy, worm, make, unwelcome, visit, thousand...
Name: Text, Length: 1490, dtype: object


In [282]:
# Training the Word2Vec model
t=time()
model = Word2Vec(tweetText, min_count=2,workers=7)       # Ignores all words with total absolute frequency lower than 2 and uses 7 worker threads to train the model
words =  model.wv.key_to_index                           

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 17:10:54: collecting all words and their counts
INFO - 17:10:54: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 17:10:54: collected 24834 word types from a corpus of 327187 raw words and 1490 sentences
INFO - 17:10:54: Creating a fresh vocabulary
INFO - 17:10:54: Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 14942 unique words (60.16751228154949%% of original 24834, drops 9892)', 'datetime': '2021-11-08T17:10:54.524547', 'gensim': '4.1.2', 'python': '3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
INFO - 17:10:54: Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 317295 word corpus (96.97665249536198%% of original 327187, drops 9892)', 'datetime': '2021-11-08T17:10:54.525578', 'gensim': '4.1.2', 'python': '3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_voca

Time to train the model: 0.02 mins


In [283]:
print(words)       #prints all the vocabulary found out in the word2vec model



In [267]:
# Finding word Vectors
vector = model.wv['entertainment']
print(vector)

[-0.2319754   0.4373964   0.3081828   0.20914842 -0.05362097 -0.70766836
 -0.09084617  0.93580574 -0.22394222 -0.25147    -0.07347605 -0.7065266
 -0.11898592  0.41720116 -0.02581927 -0.3174657  -0.13080119 -0.47943214
  0.1596204  -0.9931767   0.33016425  0.2776057   0.02142363 -0.0971847
 -0.1366145   0.15079534 -0.16040218 -0.15704189 -0.34443024  0.07541261
  0.30469948 -0.09794006  0.19442296 -0.20326497 -0.01473201  0.3660454
  0.11356224 -0.28347522 -0.14557323 -0.7845251   0.04706049 -0.4899026
 -0.17936523 -0.14666247  0.37373993 -0.37434885 -0.572218    0.21230896
  0.33112538  0.5377251   0.11068539 -0.1979758  -0.1953409  -0.10825908
 -0.24950066  0.24522915  0.22521502 -0.15483417 -0.5209057   0.25001556
  0.18703018  0.3737981  -0.26700208  0.07536666 -0.6956297   0.3855386
  0.08443395  0.35801092 -0.30445212  0.27930406 -0.2806987   0.39639232
  0.39830917 -0.12235455  0.49675643  0.23217477 -0.01622148  0.01332388
 -0.40230304  0.03690454 -0.07462896 -0.23640291 -0.1619

In [268]:
model.wv.most_similar(positive=["tech"])     #finds most similar words

[('operating', 0.9971630573272705),
 ('buy', 0.997015655040741),
 ('instead', 0.9963255524635315),
 ('chinese', 0.9962673187255859),
 ('large', 0.996232807636261),
 ('improving', 0.9962231516838074),
 ('amount', 0.9961767792701721),
 ('telecom', 0.9961448311805725),
 ('seeking', 0.9960784316062927),
 ('area', 0.9960241913795471)]

In [271]:
model.wv.similarity('apple', 'ipod')         #finds similarity between the two words

0.9966841