In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.naive_bayes import MultinomialNB
import nltk
import matplotlib
from matplotlib import pyplot as plt
import string

In [2]:
# Load the raw training data from train.csv in the working directory.
# Later cells read its 'user.id', 'age' and 'text' columns.
train=pd.read_csv('train.csv')

In [3]:
# Number of rows (posts) per user.id, kept as a Series named 'Count'
# on a fresh positional index.
train_count = (
    train.groupby(['user.id'])["user.id"]
    .size()
    .reset_index(name='Count')['Count']
)

In [4]:
# Collapse the frame to one row per (user.id, age) pair by joining all of that
# user's posts into a single space-separated string.
train = (
    train.groupby(['user.id', 'age'])["text"]
    .apply(lambda posts: "%s" % ' '.join(posts))
    .reset_index(name='text')
)

In [5]:
# Keep the user.id column of the per-user frame under its own name.
train_id=train['user.id']

In [6]:
# Preview the first 10 per-user rows.
train.head(10)

Unnamed: 0,user.id,age,text
0,1,17,Doritos Fuel of Space urlLink : Ian My...
1,3,16,"Yeah, I set up this blog so Heather w..."
2,4,17,Not too much exciting has happened ...
3,5,47,European Pilgrimage For High sch...
4,6,16,Last night's dream is very blurry. T...
5,7,25,"June 21, 2004 SPAIN V P..."
6,12,23,"Well, 1 day left... Less than 36 ..."
7,14,37,Every now and then you see a post on one of...
8,15,17,Well around 3 me adam and jake went ou...
9,16,15,"yesterday,went out wif my sis..."


In [7]:
def count_punct(text):
    '''Return how many items of `text` are members of string.punctuation.'''
    punctuation_marks = set(string.punctuation)
    return sum(1 for ch in text if ch in punctuation_marks)

In [8]:
# Punctuation feature, built in three steps on the *named* column rather than on
# position 3 (positional access silently reads another column if the frame
# layout ever changes):
#   1. total punctuation characters in each user's concatenated text,
#   2. divided by that user's number of posts (train_count, aligned on the row index),
#   3. divided by the largest per-post average, so values are relative to the maximum.
train['punct_count'] = train['text'].apply(count_punct)
train['punct_count'] = train['punct_count'].div(train_count, axis=0)
train['punct_count'] = train['punct_count'].div(train['punct_count'].max(), axis=0)

In [9]:
# Preview the frame, now including the punct_count column.
train.head()

Unnamed: 0,user.id,age,text,punct_count
0,1,17,Doritos Fuel of Space urlLink : Ian My...,0.003312
1,3,16,"Yeah, I set up this blog so Heather w...",0.022772
2,4,17,Not too much exciting has happened ...,0.025595
3,5,47,European Pilgrimage For High sch...,0.017341
4,6,16,Last night's dream is very blurry. T...,0.015028


In [10]:
def remove_punctuation(text):
    '''Return `text` with every character of string.punctuation deleted.'''
    # Mapping a character to None in a translation table deletes it.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(drop_table)
# Strip punctuation from every user's text; this overwrites train['text'].
train['text'] = train['text'].apply(remove_punctuation)

In [11]:
# Fetch the NLTK stop-word corpus (reports "already up-to-date" when present).
nltk.download('stopwords')
# Look the corpus up through the nltk package rather than the bare name
# `stopwords`: the function defined right below rebinds that name, so running
# this cell a second time would otherwise fail on `stopwords.words`.
sw = nltk.corpus.stopwords.words('english')
def stopwords(text):
    '''Lowercase `text`, drop English stop words, and re-join with single spaces.

    `text` is split on whitespace; each token is lowercased and kept only if it
    is not in the module-level stop-word list `sw`.

    NOTE: this name shadows the `stopwords` corpus imported from nltk.corpus at
    the top of the file; it is kept because a later cell calls it by this name.
    '''
    # Build the lookup set once per call: membership tests on a list are linear per token.
    stop_set = set(sw)
    # Lowercase each token exactly once, then filter.
    lowered = (word.lower() for word in text.split())
    return " ".join(word for word in lowered if word not in stop_set)
# Lowercase the training text and remove stop words; this overwrites train['text'].
train['text'] = train['text'].apply(stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
stemmer = SnowballStemmer("english")

def stemming(text):
    '''Return `text` with each whitespace-separated token replaced by its stem, joined by single spaces.'''
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)
# Stem every token of the training text (overwrites train['text']), then preview the result.
train['text'] = train['text'].apply(stemming)
train.head(10)

In [None]:
# Save the cleaned per-user training frame (without the row index) to train_clean.csv.
train.to_csv("train_clean.csv", encoding = 'utf-8', index = False)

In [None]:
#new
count_vect = CountVectorizer(max_features=200)
X_train_counts = count_vect.fit_transform(train.text)
X_train_counts.shape

In [None]:
#new
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

In [None]:
# Remove the identifier and raw-text columns now that the text has been vectorized.
train = train.drop(columns=['user.id', 'text'])

In [None]:
# Densify the TF-IDF matrix into a DataFrame (default integer column labels and row index).
tfidf_df = pd.DataFrame(X_train_tfidf.todense())

In [None]:
# Attach the TF-IDF columns to the remaining train columns, matching rows on the index;
# how='right' keeps the rows of tfidf_df.
train_total=train.join(tfidf_df,how='right')

In [None]:
# Preview the combined feature frame.
train_total.head()

In [None]:
# Target vector for the classifier: the age column.
train_age=train_total['age']

In [None]:
# Drop the target so that only feature columns remain.
train_total = train_total.drop(columns='age')

In [None]:
# Fit a multinomial Naive Bayes classifier on the feature frame, with age as the class label.
clf = MultinomialNB().fit(train_total, train_age)

In [None]:
# Load the raw test data from test.csv in the working directory.
# Later cells read its 'user.id', 'topic' and 'text' columns.
test=pd.read_csv('test.csv')

In [None]:
# Per-user topic features: one-hot encode each post's topic and sum the indicators per user.
test_topic=test[['user.id','topic']]
test_topic=pd.concat([test_topic,pd.get_dummies(test_topic['topic'])],axis=1)
# NOTE(review): the original string 'topic' column is still present when sum() runs;
# whether it is dropped or kept depends on the pandas version — confirm the column
# layout that the positional slices below rely on.
test_topic=test_topic.groupby('user.id').sum().reset_index()
test_id=test_topic['user.id']
# Number of posts per user.
test_count=test.groupby(['user.id'])["user.id"].size().reset_index(name='Count')
test_topic=pd.merge(test_topic,test_count,how='left',on='user.id')
# Divide the columns at positions 1..40 by the last column (Count, added by the merge).
# NOTE(review): the 1:41 slice presumably targets the topic indicator columns and so
# assumes exactly 40 topics — TODO confirm.
test_merge=test_topic.iloc[:,1:41].div(test_topic.iloc[:,-1],axis=0)
# Scale every value down by a constant 1000.
# NOTE(review): the reason for the factor 1000 is not visible in this notebook.
test_merge=test_merge.iloc[:,0:41].div(1000,axis=0)
# Post counts rescaled by the largest count.
test_count=test_count['Count']
test_count=test_count.div(test_count.max(),axis=0)
# Re-attach user.id and the rescaled Count by row index.
test_merge=test_merge.join(test_id,how='right')
test_merge=test_merge.join(test_count,how='left')
# Join each user's posts into one space-separated string, then add the features by user.id.
test=test.groupby('user.id')["text"].apply(lambda x: "%s" % ' '.join(x)).reset_index(name='text')
test=pd.merge(test,test_merge,how='left',on='user.id')
test.head()

In [None]:
# Apply the same cleaning steps, in the same order, that were applied to the
# training text: punctuation removal -> stop-word removal -> stemming.
# The punctuation step was missing here, so test tokens were filtered and
# stemmed with punctuation still attached, unlike the training tokens.
test['text'] = test['text'].apply(remove_punctuation)
test['text'] = test['text'].apply(stopwords)
test['text'] = test['text'].apply(stemming)
test.head(10)

In [None]:
# Reuse the vectorizer and TF-IDF transformer that were fitted on the training text.
# Re-fitting them on the test set (fit_transform) would learn a different vocabulary
# and different IDF weights, so a given column would no longer represent the term
# the classifier was trained on.
X_test_counts = count_vect.transform(test.text)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
# Displays the shape of the test TF-IDF matrix.
X_test_tfidf.shape

In [None]:
# Save the cleaned per-user test frame (without the row index) to test_clean.csv.
test.to_csv("test_clean.csv", encoding = 'utf-8', index = False)

In [None]:
# Remove the rescaled post-count column from the test frame.
test = test.drop(columns='Count')

In [None]:
# Second name for the same DataFrame object (no copy is made). It still carries
# 'user.id', which is read later when the output frame is built.
test_2=test

In [None]:
# Drop the identifier and raw-text columns, then append the TF-IDF columns by row index.
test = test.drop(columns=['user.id', 'text'])
tfidf_df = pd.DataFrame(X_test_tfidf.todense())
test_total = test.join(tfidf_df, how='right')
test_total.head()

In [None]:
# Predict an age class for every row of the test feature frame.
# NOTE(review): the columns of test_total must match, in number and order, the
# columns clf was fitted on — confirm against the training feature frame.
predicted = clf.predict(test_total)

In [None]:
# Assemble the output frame in one step instead of filling an empty frame through
# list-keyed assignment (df[["age"]] = 1-D array). That form relies on
# version-specific pandas behaviour and is rejected by recent releases when the
# array length differs from the number of keys.
cols = ["user.id", "age"]
test_output = pd.DataFrame(
    {"user.id": test_2["user.id"], "age": predicted},
    columns=cols,
)

In [None]:
# Preview the first predictions.
test_output.head()

In [None]:
# train.to_csv("train_FRE.csv", encoding = 'utf-8', index = False)
# Write the (user.id, age) predictions, without the row index, to test_sklearn_output.csv.
test_output.to_csv("test_sklearn_output.csv", encoding = 'utf-8', index = False)

In [None]:
#def length(text):    
 #   '''a function which returns the length of text'''
 #   return len(text)

In [None]:
#train['length'] = train['text'].apply(length)
#train.head(10)

In [None]:
# Build a TF-IDF vectorizer that removes English stop words. The setting must be
# passed by keyword: the first parameter of TfidfVectorizer is `input`, not
# `stop_words`, so TfidfVectorizer("english") never configured stop-word removal.
# NOTE(review): an earlier cell drops the 'text' column from `train`; this cell
# needs a frame that still has it — confirm the intended cell order.
tfid_vectorizer = TfidfVectorizer(stop_words="english")
# Learn the vocabulary and IDF weights from the text data.
tfid_vectorizer.fit(train['text'])
# (term, column index) pairs of the fitted vocabulary.
dictionary = tfid_vectorizer.vocabulary_.items()

In [None]:
# Save the current state of the train frame (without the row index) to train_sklearn_intermediate.csv.
train.to_csv("train_sklearn_intermediate.csv", encoding = 'utf-8', index = False)

In [None]:
# Vectorize the text with the already-fitted vectorizer.
tfid_matrix = tfid_vectorizer.transform(train['text'])
# NOTE(review): no conversion to a NumPy array happens here; transform() presumably
# returns a SciPy sparse matrix — call .toarray() explicitly if a dense array is needed.

In [None]:
# Display the matrix's repr.
tfid_matrix