In [35]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import nltk
import matplotlib
from matplotlib import pyplot as plt

In [4]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [5]:
def remove_punctuation(text):
    '''a function for removing punctuation'''
    import string
    # replacing the punctuations with no space, 
    # which in effect deletes the punctuation marks 
    translator = str.maketrans('', '', string.punctuation)
    # return the text stripped of punctuation marks
    return text.translate(translator)

In [6]:
train['text'] = train['text'].apply(remove_punctuation)
train['text'].head(10)

0               Info has been found  100 pages and ...
1               These are the team members   Drewes...
2               In het kader van kernfusie op aarde...
3                           testing  testing          
4                 Thanks to Yahoos Toolbar I can no...
5                 I had an interesting conversation...
6                 Somehow CocaCola has a way of sum...
7                 If anything Korea is a country of...
8                 Take a read of this news article ...
9                 I surf the English news sites a l...
Name: text, dtype: object

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
sw = stopwords.words('english')

In [10]:
def stopwords(text):
    '''a function for removing the stopword'''
    # removing the stop words and lowercasing the selected words
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    # joining the list of words with space separator
    return " ".join(text)

In [11]:
train['text'] = train['text'].apply(stopwords)
train.head(10)

Unnamed: 0,post.id,user.id,gender,topic,sign,date,text,age
0,1,11869,male,Student,Leo,"14,May,2004",info found 100 pages 45 mb pdf files wait unti...,15
1,2,11869,male,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...,15
2,3,11869,male,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...,15
3,4,11869,male,Student,Leo,"12,May,2004",testing testing,15
4,5,16332,male,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...,33
5,6,16332,male,InvestmentBanking,Aquarius,"10,June,2004",interesting conversation dad morning talking k...,33
6,7,16332,male,InvestmentBanking,Aquarius,"10,June,2004",somehow cocacola way summing things well early...,33
7,8,16332,male,InvestmentBanking,Aquarius,"10,June,2004",anything korea country extremes everything see...,33
8,9,16332,male,InvestmentBanking,Aquarius,"10,June,2004",take read news article urllink joongang ilbo n...,33
9,10,16332,male,InvestmentBanking,Aquarius,"09,June,2004",surf english news sites lot looking tidbits ko...,33


In [15]:
stemmer = SnowballStemmer("english")

def stemming(text):    
    '''a function which stems each word in the given text'''
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text) 

In [17]:
train['text'] = train['text'].apply(stemming)
train.head(10)

Unnamed: 0,post.id,user.id,gender,topic,sign,date,text,age
0,1,11869,male,Student,Leo,"14,May,2004",info found 100 page 45 mb pdf file wait until ...,15
1,2,11869,male,Student,Leo,"13,May,2004",team member drew van der laag urllink mail rui...,15
2,3,11869,male,Student,Leo,"12,May,2004",het kader van kernfusi op aard maak je eigen w...,15
3,4,11869,male,Student,Leo,"12,May,2004",test test,15
4,5,16332,male,InvestmentBanking,Aquarius,"11,June,2004",thank yahoo toolbar captur url popupswhich mea...,33
5,6,16332,male,InvestmentBanking,Aquarius,"10,June,2004",interest convers dad morn talk korean put mone...,33
6,7,16332,male,InvestmentBanking,Aquarius,"10,June,2004",somehow cocacola way sum thing well earli 1970...,33
7,8,16332,male,InvestmentBanking,Aquarius,"10,June,2004",anyth korea countri extrem everyth seem fadbas...,33
8,9,16332,male,InvestmentBanking,Aquarius,"10,June,2004",take read news articl urllink joongang ilbo no...,33
9,10,16332,male,InvestmentBanking,Aquarius,"09,June,2004",surf english news site lot look tidbit korea f...,33


In [13]:
#def length(text):    
 #   '''a function which returns the length of text'''
 #   return len(text)

In [15]:
#train['length'] = train['text'].apply(length)
#train.head(10)

Unnamed: 0,post.id,user.id,gender,topic,sign,date,text,age,length
0,1,11869,male,Student,Leo,"14,May,2004",info found 100 pages 45 mb pdf files wait unti...,15,82
1,2,11869,male,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...,15,97
2,3,11869,male,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...,15,17165
3,4,11869,male,Student,Leo,"12,May,2004",testing testing,15,15
4,5,16332,male,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...,33,244
5,6,16332,male,InvestmentBanking,Aquarius,"10,June,2004",interesting conversation dad morning talking k...,33,2244
6,7,16332,male,InvestmentBanking,Aquarius,"10,June,2004",somehow cocacola way summing things well early...,33,688
7,8,16332,male,InvestmentBanking,Aquarius,"10,June,2004",anything korea country extremes everything see...,33,1440
8,9,16332,male,InvestmentBanking,Aquarius,"10,June,2004",take read news article urllink joongang ilbo n...,33,1498
9,10,16332,male,InvestmentBanking,Aquarius,"09,June,2004",surf english news sites lot looking tidbits ko...,33,636


In [39]:
# create the object of tfid vectorizer
tfid_vectorizer = TfidfVectorizer("english")
# fit the vectorizer using the text data
tfid_vectorizer.fit(train['text'])
# collect the vocabulary items used in the vectorizer
dictionary = tfid_vectorizer.vocabulary_.items()  

In [40]:
train.to_csv("train_sklearn_intermediate.csv", encoding = 'utf-8', index = False)

In [41]:
tfid_matrix = tfid_vectorizer.transform(train['text'])
#collect the tfid matrix in numpy array

In [43]:
tfid_matrix

<442961x942713 sparse matrix of type '<class 'numpy.float64'>'
	with 33963671 stored elements in Compressed Sparse Row format>