In [1]:
import numpy as np
from matplotlib import pyplot as plt
from numpy import random as rnd
import warnings,os,datetime,time

import torch
import pandas as pd

from keras.models import Sequential,Model
from keras.layers import Dense,BatchNormalization,Dropout,LSTM,Concatenate,Activation,Input
from keras.preprocessing.text import Tokenizer

from bs4 import BeautifulSoup

from nltk import sent_tokenize

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
basedir = '..//..//..//TF_data//BLOGENTRIES//blogs//'

files = os.listdir(basedir)[:250]

df = pd.DataFrame()
for c_file in files:
    
    try:
        with open(basedir+c_file) as fp: soup = BeautifulSoup(fp, 'xml')
    except:
        continue
    cells = soup.findAll(['date','post'])

    data = []
    for k in range(0,len(cells),2):
        data.append([
            str(cells[k]).replace('<date>','').replace('</date>','').replace(',','-'),
            str(cells[k+1]).replace('<post>','').replace('</post>','').replace('\n','').replace('\t','').lstrip().rstrip()
        ])
    cf = pd.DataFrame(data).rename(columns={0:'Date Post',1:'Text Post'})
    c_ID,c_Gender,c_Age,c_Topic,c_AstroSign = c_file.split('.')[:-1]

    cf['ID Poster'] = c_ID
    cf['Gender Poster'] = c_Gender
    cf['Age Poster'] = c_Age
    cf['Topic Channel'] = c_Topic
    cf['Astro Sign Poster'] = c_AstroSign
    
    df = pd.concat([df,cf]).reset_index(drop=True)

In [3]:
df0 = df[['Text Post','ID Poster']]

In [22]:
df1 = df0[df0['ID Poster'].isin(df0['ID Poster'].unique()[:10])]

In [23]:
df1

Unnamed: 0,Text Post,ID Poster
0,"Well, everyone got up and going this morning. ...",1000331
1,My four-year old never stops talking. She'll ...,1000331
2,"Actually it's not raining yet, but I bought 15...",1000331
3,Ha! Just set up my RSS feed - that is so easy!...,1000331
4,"Oh, which just reminded me, we were talking ab...",1000331
...,...,...
1880,"Look out, Cap'n! Rapids ahead! urlLink",1013637
1881,We've sprung a leak! Bail out! Darn porous-fib...,1013637
1882,Captain's Log: I dragged her back to drydock f...,1013637
1883,We watched Donnie Darko tonight. Wonderful mov...,1013637


In [4]:
df1 = pd.get_dummies(data=df1,prefix='ID',columns=['ID Poster'])

In [5]:
df2 = df1.sample(frac=1).reset_index(drop=True)

In [6]:
df2

Unnamed: 0,Text Post,ID_1000331,ID_1000866,ID_1004904,ID_1005076,ID_1007188,ID_1008329,ID_1009572,ID_1011153,ID_1011311,ID_1013637
0,I'm at Kelsey's (Kdawg) house. She's getting r...,0,0,0,0,0,1,0,0,0,0
1,Today was ok. I took lots of Ibuprofen so my f...,0,1,0,0,0,0,0,0,0,0
2,"December 7, 1941. ""A date that will live in in...",0,0,0,0,0,0,0,0,0,1
3,"Uh oh, I'm in my mode. The one where everythin...",0,1,0,0,0,0,0,0,0,0
4,Rocking the wait list.... Oh well. I guess it...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
1880,"So, on the one hand, throwing up sucks, but......",0,1,0,0,0,0,0,0,0,0
1881,"Wow, so, yes. Yesterday was fun. i went the wh...",0,1,0,0,0,0,0,0,0,0
1882,"A hoy-hoy (if you watch the Simpsons, that's h...",0,0,0,0,0,0,0,0,0,1
1883,"I am absolutely exhausted so, this will probab...",0,1,0,0,0,0,0,0,0,0


In [7]:
training_pctg = 0.1

In [8]:
training_sentences = sent_tokenize(''.join(df2[:int(df2.shape[0]*training_pctg)]['Text Post'].tolist()))

tokernizer = Tokenizer()
tokernizer.fit_on_texts(training_sentences)

In [9]:
num_sent_used = 3

df2['Encoded Text'] = df2['Text Post'].apply(lambda x: tokernizer.texts_to_matrix(sent_tokenize(x)).transpose())
df2['Encoded Text'] = df2['Encoded Text'].apply(lambda x: x[:,:num_sent_used] if x.shape[1]>=num_sent_used else np.concatenate([x,np.zeros([x.shape[0],num_sent_used-x.shape[1]])],axis=1))

In [10]:
X = df2['Encoded Text'].values

In [11]:
y = df2[[colname for colname in df2.columns.tolist() if 'ID_' in colname]].values

In [12]:
X[1]

array([[0., 0., 0.],
       [0., 1., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [13]:
y[1]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
Xv = np.array([x.reshape(-1) for x in X])
yv = y

In [16]:
randomized_indices = rnd.permutation(np.arange(0,Xv.shape[0]))

In [17]:
tt_index = int(len(randomized_indices)*0.85)

In [18]:
Xv_train = Xv[:tt_index]
yv_train = yv[:tt_index]

Xv_test = Xv[tt_index:]
yv_test = yv[tt_index:]

In [19]:
rfc = RandomForestClassifier()

In [20]:
rfc.fit(Xv_train,yv_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
rfc.score(Xv_test,yv_test)

0.31095406360424027