In [1]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, strip_punctuation, stem_text

import nltk
nltk.download('omw-1.4')
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

import spacy

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Working on raw dataset

In [2]:
dataset = pd.read_csv("./blogtext.csv", encoding="latin1")
df = dataset[['text', 'topic']][0:50000]

print (df.shape)
df

(50000, 2)


Unnamed: 0,text,topic
0,"Info has been found (+/- 100 pages,...",Student
1,These are the team members: Drewe...,Student
2,In het kader van kernfusie op aarde...,Student
3,testing!!! testing!!!,Student
4,Thanks to Yahoo!'s Toolbar I can ...,InvestmentBanking
...,...,...
49995,Aug 7th Thur... Bought Her Mua Chee & S...,Advertising
49996,Aug 6th Wed.. Her 1st Day @ Work Back @...,Advertising
49997,Aug 4th Mon Zing's BD !! Went To Her Pl...,Advertising
49998,Aug 3rd Sun.. Went To Her Place B4 Goin...,Advertising


In [3]:
print (type(df))
print (len(list(df.topic.unique())))
list(df.topic.unique())

<class 'pandas.core.frame.DataFrame'>
40


['Student',
 'InvestmentBanking',
 'indUnk',
 'Non-Profit',
 'Banking',
 'Education',
 'Engineering',
 'Science',
 'Communications-Media',
 'BusinessServices',
 'Sports-Recreation',
 'Arts',
 'Internet',
 'Museums-Libraries',
 'Accounting',
 'Technology',
 'Law',
 'Consulting',
 'Automotive',
 'Religion',
 'Fashion',
 'Publishing',
 'Marketing',
 'LawEnforcement-Security',
 'HumanResources',
 'Telecommunications',
 'Military',
 'Government',
 'Transportation',
 'Architecture',
 'Advertising',
 'Agriculture',
 'Biotech',
 'RealEstate',
 'Manufacturing',
 'Construction',
 'Chemicals',
 'Maritime',
 'Tourism',
 'Environment']

In [4]:
duplicate = df[df.duplicated()]
duplicate

Unnamed: 0,text,topic
762,urlLink resume,BusinessServices
764,urlLink resume help,BusinessServices
2310,hey guys - i had the flu today - th...,Technology
3469,,Technology
3578,,Technology
...,...,...
49299,La la la! La la la! La la la!...,Law
49571,I'm bored. Bite me. ~*~ Fi...,Student
49640,so im sitting at carol's house with her...,Military
49652,so this morning i didn't go to bed 'til...,Military


In [5]:
df.drop_duplicates(keep='first', inplace=True)
df.shape

(49407, 2)

In [6]:
#Checking for any more duplicates (returns an empty series)
duplicate1 = df[df.duplicated()]
duplicate1

Unnamed: 0,text,topic


In [7]:
print (len(list(df.topic.unique())))
list(df.topic.unique())

40


['Student',
 'InvestmentBanking',
 'indUnk',
 'Non-Profit',
 'Banking',
 'Education',
 'Engineering',
 'Science',
 'Communications-Media',
 'BusinessServices',
 'Sports-Recreation',
 'Arts',
 'Internet',
 'Museums-Libraries',
 'Accounting',
 'Technology',
 'Law',
 'Consulting',
 'Automotive',
 'Religion',
 'Fashion',
 'Publishing',
 'Marketing',
 'LawEnforcement-Security',
 'HumanResources',
 'Telecommunications',
 'Military',
 'Government',
 'Transportation',
 'Architecture',
 'Advertising',
 'Agriculture',
 'Biotech',
 'RealEstate',
 'Manufacturing',
 'Construction',
 'Chemicals',
 'Maritime',
 'Tourism',
 'Environment']

In [8]:
#Losing the rows where topic="indUnk"

df.drop(df[df['topic']=="indUnk"].index, inplace=True)
print (len(list(df.topic.unique())))
df

39


Unnamed: 0,text,topic
0,"Info has been found (+/- 100 pages,...",Student
1,These are the team members: Drewe...,Student
2,In het kader van kernfusie op aarde...,Student
3,testing!!! testing!!!,Student
4,Thanks to Yahoo!'s Toolbar I can ...,InvestmentBanking
...,...,...
49995,Aug 7th Thur... Bought Her Mua Chee & S...,Advertising
49996,Aug 6th Wed.. Her 1st Day @ Work Back @...,Advertising
49997,Aug 4th Mon Zing's BD !! Went To Her Pl...,Advertising
49998,Aug 3rd Sun.. Went To Her Place B4 Goin...,Advertising


In [9]:
#Changing the index to normal
df.reset_index(inplace=True)
df = df[['text', 'topic']]
df

Unnamed: 0,text,topic
0,"Info has been found (+/- 100 pages,...",Student
1,These are the team members: Drewe...,Student
2,In het kader van kernfusie op aarde...,Student
3,testing!!! testing!!!,Student
4,Thanks to Yahoo!'s Toolbar I can ...,InvestmentBanking
...,...,...
32001,Aug 7th Thur... Bought Her Mua Chee & S...,Advertising
32002,Aug 6th Wed.. Her 1st Day @ Work Back @...,Advertising
32003,Aug 4th Mon Zing's BD !! Went To Her Pl...,Advertising
32004,Aug 3rd Sun.. Went To Her Place B4 Goin...,Advertising


# Cleaning previously processed dataset

In [10]:
#Found a list of words online, going to use it to remove all useless words
words = open('Words.txt')
word = []
for line in words:
    word.append(line.strip())

print (len(word))
word

113850


['aa',
 'aah',
 'aahed',
 'aahing',
 'aahs',
 'aal',
 'aalii',
 'aaliis',
 'aals',
 'aardvark',
 'aardvarks',
 'aardwolf',
 'aardwolves',
 'aas',
 'aasvogel',
 'aasvogels',
 'aba',
 'abaca',
 'abacas',
 'abaci',
 'aback',
 'abacus',
 'abacuses',
 'abaft',
 'abaka',
 'abakas',
 'abalone',
 'abalones',
 'abamp',
 'abampere',
 'abamperes',
 'abamps',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abandonments',
 'abandons',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasement',
 'abasements',
 'abaser',
 'abasers',
 'abases',
 'abash',
 'abashed',
 'abashes',
 'abashing',
 'abasing',
 'abatable',
 'abate',
 'abated',
 'abatement',
 'abatements',
 'abater',
 'abaters',
 'abates',
 'abating',
 'abatis',
 'abatises',
 'abator',
 'abators',
 'abattis',
 'abattises',
 'abattoir',
 'abattoirs',
 'abaxial',
 'abaxile',
 'abbacies',
 'abbacy',
 'abbatial',
 'abbe',
 'abbes',
 'abbess',
 'abbesses',
 'abbey',
 'abbeys',
 'abbot',
 'abbotcies',
 'abbotcy',
 'abbots',
 'abbreviate',
 '

In [11]:
#Created a list of custom stopwords from the one I made here as it wasn't helping directly

stops = open('Stopwords.txt')
stop = []

for line in stops:
    stop.append(line.strip())

print (len(stop))
stop

349


['a',
 'about',
 'above',
 'absolutely',
 'actually',
 'after',
 'again',
 'against',
 'ah',
 'ain',
 'aint',
 "ain't",
 'all',
 'also',
 'although',
 'always',
 'am',
 'an',
 'and',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anyway',
 'anywho',
 'are',
 'aren',
 "aren't",
 'around',
 'as',
 'at',
 'away',
 'b',
 'back',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'c',
 'can',
 'certainly',
 'come',
 'could',
 'coulda',
 'couldn',
 "couldn't",
 'd',
 'day',
 'definitely',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'e',
 'each',
 'eh',
 'else',
 'even',
 'ever',
 'every',
 'f',
 'feel',
 'few',
 'for',
 'from',
 'further',
 'g',
 'get',
 'give',
 'go',
 'goes',
 'going',
 'gon',
 "gon'",
 'gonna',
 'got',
 'gotta',
 'h',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'hello',
 '

In [12]:
#Trying a different approach to stopword-removal by creating a new wordlist containing all valid words except stopwords

print (len(word), len(stop))
print (len(word) - len(stop))

for i in word:
    a = i.strip()
    if a in stop:
        word.remove(i)
print (len(word))

#(len(word)) > (len(word) - len(stop)) because stop contains some words not in word

113850 349
113501
113614


In [13]:
#Checking for any stopwords still present in word
"then" in word

False

In [14]:
df[['text']] = df[['text']].astype(str)
df

Unnamed: 0,text,topic
0,"Info has been found (+/- 100 pages,...",Student
1,These are the team members: Drewe...,Student
2,In het kader van kernfusie op aarde...,Student
3,testing!!! testing!!!,Student
4,Thanks to Yahoo!'s Toolbar I can ...,InvestmentBanking
...,...,...
32001,Aug 7th Thur... Bought Her Mua Chee & S...,Advertising
32002,Aug 6th Wed.. Her 1st Day @ Work Back @...,Advertising
32003,Aug 4th Mon Zing's BD !! Went To Her Pl...,Advertising
32004,Aug 3rd Sun.. Went To Her Place B4 Goin...,Advertising


In [15]:
lemmatizer = WordNetLemmatizer()
clean_text = []

def preprocess(text):   
    #New stopwords function, tokenize and lemmatize (remove punctuation before tokenization)
    text = text.lower()
    text = strip_punctuation(text) 
    text = strip_numeric(text)
    text = word_tokenize(text)
    text = [w for w in text if w in word]    
    text = [a for a in text if a not in stop]
    text = " ".join(text)
    text = lemmatizer.lemmatize(text)

    clean_text.append(text)
    return text

egtext = "Hello, aaaah then this is 1 trial of the above function oh hrfjnfjm!"

print (preprocess(egtext)) #TRIAL SUCCESSFUL

trial function


In [16]:
df1 = pd.DataFrame(df['text'])
df1[['text']] = df1[['text']].astype(str)

In [17]:
df1.shape
df1

Unnamed: 0,text
0,"Info has been found (+/- 100 pages,..."
1,These are the team members: Drewe...
2,In het kader van kernfusie op aarde...
3,testing!!! testing!!!
4,Thanks to Yahoo!'s Toolbar I can ...
...,...
32001,Aug 7th Thur... Bought Her Mua Chee & S...
32002,Aug 6th Wed.. Her 1st Day @ Work Back @...
32003,Aug 4th Mon Zing's BD !! Went To Her Pl...
32004,Aug 3rd Sun.. Went To Her Place B4 Goin...


In [18]:
df

Unnamed: 0,text,topic
0,"Info has been found (+/- 100 pages,...",Student
1,These are the team members: Drewe...,Student
2,In het kader van kernfusie op aarde...,Student
3,testing!!! testing!!!,Student
4,Thanks to Yahoo!'s Toolbar I can ...,InvestmentBanking
...,...,...
32001,Aug 7th Thur... Bought Her Mua Chee & S...,Advertising
32002,Aug 6th Wed.. Her 1st Day @ Work Back @...,Advertising
32003,Aug 4th Mon Zing's BD !! Went To Her Pl...,Advertising
32004,Aug 3rd Sun.. Went To Her Place B4 Goin...,Advertising


In [19]:
(i, j) = df.shape
i, j

(32006, 2)

In [23]:
(i, j) = df1.shape
print (i)

cleantext = []

for x in range(i):
    cleantext.append(preprocess(df1._get_value(x, "text")))    #WORKS BUT TAKES VERY LONG TO RUN (Around 4 hrs)
cleantext[3]

#Started at 8:56, ended at 1:20

32006


'testing testing'

In [31]:
print (len(cleantext))
ct = pd.DataFrame(cleantext)
ct    #CLEAN DATAFRAME

32006


Unnamed: 0,0
0,info found pages files wait team leader proces...
1,team members van mail mail mail
2,het van op build bomb rec humor subject build ...
3,testing testing
4,thanks yahoo capture means show cool links pop...
...,...
32001,bought send home work
32002,wed work sent work sent
32003,mon zing place cooked dinner together finally ...
32004,sun place zing present raffles centre double s...


In [37]:
#Add the topics

cleantext_topics = ct.join(df['topic'])
cleantext_topics

Unnamed: 0,0,topic
0,info found pages files wait team leader proces...,Student
1,team members van mail mail mail,Student
2,het van op build bomb rec humor subject build ...,Student
3,testing testing,Student
4,thanks yahoo capture means show cool links pop...,InvestmentBanking
...,...,...
32001,bought send home work,Advertising
32002,wed work sent work sent,Advertising
32003,mon zing place cooked dinner together finally ...,Advertising
32004,sun place zing present raffles centre double s...,Advertising


In [38]:
type(cleantext_topics)

pandas.core.frame.DataFrame

In [39]:
cleantext_topics.to_csv('Cleaned_Blog_Dataset.csv', index=False)

In [40]:
df.to_csv('Raw_Blog_Dataset.csv', index=False)

In [46]:
all_text_topics = pd.DataFrame(df['text']).join(cleantext_topics)
all_text_topics.rename(columns = {0:'clean_text'}, inplace = True)
all_text_topics.rename(columns = {'text':'raw_text'}, inplace = True)
all_text_topics

Unnamed: 0,raw_text,clean_text,topic
0,"Info has been found (+/- 100 pages,...",info found pages files wait team leader proces...,Student
1,These are the team members: Drewe...,team members van mail mail mail,Student
2,In het kader van kernfusie op aarde...,het van op build bomb rec humor subject build ...,Student
3,testing!!! testing!!!,testing testing,Student
4,Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo capture means show cool links pop...,InvestmentBanking
...,...,...,...
32001,Aug 7th Thur... Bought Her Mua Chee & S...,bought send home work,Advertising
32002,Aug 6th Wed.. Her 1st Day @ Work Back @...,wed work sent work sent,Advertising
32003,Aug 4th Mon Zing's BD !! Went To Her Pl...,mon zing place cooked dinner together finally ...,Advertising
32004,Aug 3rd Sun.. Went To Her Place B4 Goin...,sun place zing present raffles centre double s...,Advertising


In [69]:
all_text_topics['clean_text'].isnull().any()     #Don't know why this doesn't give the right answer

False

In [65]:
all_text_topics.to_csv('Mixed_Text_Blog_Dataset.csv', index=False)

In [75]:
dataset_mixed = pd.read_csv('Mixed_Text_Blog_Dataset.csv')

print (dataset_mixed.shape)
print (dataset_mixed.index)
print (dataset_mixed.columns)
print()

print (dataset_mixed["clean_text"].isnull().any())
print (dataset_mixed["clean_text"].isnull().sum())
print (dataset_mixed['clean_text'].notnull().sum())
print()

(32006, 3)
RangeIndex(start=0, stop=32006, step=1)
Index(['raw_text', 'clean_text', 'topic'], dtype='object')

True
508
31498



In [79]:
#Dropping rows where clean_text=null

dataset_mixed = dataset_mixed.dropna(subset=['clean_text'])
dataset_mixed

Unnamed: 0,raw_text,clean_text,topic
0,"Info has been found (+/- 100 pages,...",info found pages files wait team leader proces...,Student
1,These are the team members: Drewe...,team members van mail mail mail,Student
2,In het kader van kernfusie op aarde...,het van op build bomb rec humor subject build ...,Student
3,testing!!! testing!!!,testing testing,Student
4,Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo capture means show cool links pop...,InvestmentBanking
...,...,...,...
32001,Aug 7th Thur... Bought Her Mua Chee & S...,bought send home work,Advertising
32002,Aug 6th Wed.. Her 1st Day @ Work Back @...,wed work sent work sent,Advertising
32003,Aug 4th Mon Zing's BD !! Went To Her Pl...,mon zing place cooked dinner together finally ...,Advertising
32004,Aug 3rd Sun.. Went To Her Place B4 Goin...,sun place zing present raffles centre double s...,Advertising


In [80]:
dataset_mixed.to_csv('Final_Blog_Dataset.csv', index=False)