In [1]:
import pickle

import pandas as pd
import numpy as np
# to visualise all the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# for text processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#from sklearn.feature_extraction.text import TfidfVectorizer #TF-IDF library

# Import features

In [2]:
# Load the pickled training split (features X, target y)
import pathlib

# Directory that holds the pickle, relative to the notebook's working directory.
# Built with pathlib so the separator matches the host OS instead of a hard-coded '\\';
# kept as a str so any later string concatenation on this name keeps working.
path_to_read_model = str(pathlib.Path('..') / 'probleam_study')

# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles that come from a trusted source.
with open(pathlib.Path(path_to_read_model) / 'trainFeatures_list.pkl', 'rb') as f:
    X, y = pickle.load(f)

# PreProcessing Objects needed for Transformations

In [3]:
# Work on a deep copy so the unpickled frame X is left untouched.
df = X.copy(deep=True)
print(">>shape before processing: ",df.shape)
# One pipeline: drop the unused columns, keep rows that have a title,
# then remove exact duplicate rows.
df = (
    df.drop(columns=['id', 'text', 'author'])
      .loc[lambda frame: frame['title'].notna()]
      .drop_duplicates()
)
print(">>shape after processing: ",df.shape)

>>shape before processing:  (5824, 4)
>>shape after processing:  (5641, 1)


In [179]:
def text_transform(paragraph):
    """Normalise one piece of text into a space-separated string of stemmed tokens.

    Steps, in order: remove bracketed numeric references such as ``[12]``,
    lower-case, replace digits and every other non-letter character with a
    space, collapse runs of whitespace, drop English stop words and
    Porter-stem the words that remain.

    Parameters
    ----------
    paragraph : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    ps = PorterStemmer()
    # Build the stop-word set once per call; it used to be rebuilt for every
    # single word inside the comprehension below.
    stop_words = set(stopwords.words('english'))

    text = re.sub(r'\[[0-9]*\]',' ',paragraph).lower()
    text = re.sub(r'\d',' ',text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub(r'\s+',' ',text)

    # NOTE(review): punctuation has already been replaced by spaces above, so
    # sent_tokenize probably yields at most one sentence here — confirm whether
    # sentence splitting was meant to happen before the cleaning.
    corpus = []
    for sentence in nltk.sent_tokenize(text):
        stems = [ps.stem(word) for word in sentence.split() if word not in stop_words]
        corpus.append(' '.join(stems))
    return ' '.join(corpus)
# Clean every title; the function is passed directly rather than wrapped in a lambda.
df['title'] = df['title'].map(text_transform)

In [180]:
# Display the frame to eyeball the transformed titles.
df

Unnamed: 0,title
4995,hillari clinton berni sander meet battl end new york time
8280,donald trump hillari clinton edward snowden thursday even brief new york time
12778,liber get pass time media ignor leftist comment pedophilia breitbart
12612,trump support jail tri rig elect commit voter fraud
11342,air forc vet first american convict tri join islam state sentenc year
...,...
16569,clinton aid profit firm illeg rais million nonprofit clinton foundat
6777,trump ghostwrit explain beat debat new york time
2475,todd trump still asterisk breitbart
11061,gaiaport interweb gaia energet strengthen


In [170]:
# The module-level import of TfidfVectorizer is commented out, so bring it into
# scope here; without it this cell raises NameError on a fresh kernel.
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
# NOTE(review): this rebinds X (the raw feature frame unpickled earlier) to the
# TF-IDF matrix, so a later Transformations(X, y) call only works if the
# features are reloaded first — consider a distinct name for this matrix.
X = cv.fit_transform(df['title'])

In [174]:
# Number of distinct terms learned by the vectorizer. vocabulary_ maps each
# term to its column index, so its length is the feature count; unlike
# get_feature_names(), it is available in every scikit-learn release.
len(cv.vocabulary_)

8243

In [175]:
# Shape of the matrix returned by cv.fit_transform(df['title']).
X.shape

(5641, 8243)

In [122]:
# Take the first title as a scratch example for the step-by-step cleaning cells.
paragraph = df['title'].iloc[0]
paragraph

'Hillary Clinton and Bernie Sanders Meet as Their Battle Ends - The New York Times'

In [123]:
# Stemmer instance, also used by the stemming cell that follows.
ps = PorterStemmer()
# Strip bracketed numeric references first, then lower-case.
text = re.sub(r'\[[0-9]*\]', ' ', paragraph).lower()
# Digits -> space, any other non-letter -> space, then collapse whitespace runs.
for pattern in (r'\d', '[^a-zA-Z]', r'\s+'):
    text = re.sub(pattern, ' ', text)
text

'hillary clinton and bernie sanders meet as their battle ends the new york times'

In [124]:
sentences = nltk.sent_tokenize(text)
# Build the stop-word set once instead of once per word inside the loop.
stop_words = set(stopwords.words('english'))
corpus = []
for sentence in sentences:
    # Drop stop words, stem what is left, and re-join into one string per sentence.
    review = ' '.join(ps.stem(word) for word in sentence.split() if word not in stop_words)
    corpus.append(review)
' '.join(corpus)

'hillari clinton berni sander meet battl end new york time'

# Data Transformation function

In [3]:
def Transformations(featureDF, target):
    """Clean the feature frame, normalise its titles and align the target with the surviving rows.

    The columns 'id', 'text' and 'author' are dropped, rows whose 'title' is
    null are removed, exact duplicate rows are removed, and every remaining
    title is lower-cased, stripped of non-letters and stop words, and stemmed.
    The input frame itself is not modified; a new frame is returned.

    Parameters
    ----------
    featureDF : pandas.DataFrame
        Must contain the columns 'id', 'text', 'author' and 'title'.
    target : object with ``.shape`` that supports ``target[featureDF.index]``
        Labels for ``featureDF``.
        NOTE(review): presumably a pandas Series sharing featureDF's index —
        confirm against the caller.

    Returns
    -------
    tuple
        ``(featureDF, target)`` — the cleaned frame and the matching subset of the target.
    """
    print(">>feature shape before cleaning: ",featureDF.shape)
    print(">>target shape before cleaning: ",target.shape)
    featureDF = (
        featureDF
        .drop(['id', 'text', 'author'], axis = 1)       # drop unwanted columns
        .loc[lambda frame: frame['title'].notnull()]     # eliminate null values in title column
        .drop_duplicates()                               # duplicate elimination
    )
    # Rows were removed from the features, so keep only the matching targets.
    # This is done exactly once: the text step below never changes the index,
    # and re-applying the same index to an already-reduced, positionally
    # indexed target would select the wrong rows or fail.
    target = target[featureDF.index]
    print(">>feature shape after cleaning: ",featureDF.shape)
    print(">>target shape after cleaning: ",target.shape)

    ps = PorterStemmer()
    # Built once for the whole frame rather than once per word.
    stop_words = set(stopwords.words('english'))

    def text_transform(paragraph):
        """Return ``paragraph`` as space-joined stems with stop words removed."""
        text = re.sub(r'\[[0-9]*\]',' ',paragraph).lower()
        text = re.sub(r'\d',' ',text)
        text = re.sub('[^a-zA-Z]', ' ', text)
        text = re.sub(r'\s+',' ',text)
        corpus = []
        for sentence in nltk.sent_tokenize(text):
            stems = [ps.stem(word) for word in sentence.split() if word not in stop_words]
            corpus.append(' '.join(stems))
        return ' '.join(corpus)

    # assign() returns a new frame, so nothing is written into a filtered slice.
    featureDF = featureDF.assign(title=featureDF['title'].map(text_transform))
    print(">>feature shape after preProcessing: ",featureDF.shape)
    print(">>target shape after preProcessing: ",target.shape)

    return featureDF, target

In [4]:

# NOTE(review): X and y are rebound to the cleaned outputs, so this cell is not
# re-runnable as-is — Transformations drops 'id', 'text' and 'author', which the
# already-cleaned frame no longer has.
X, y = Transformations(X, y)

>>feature shape before cleaning:  (5824, 4)
>>target shape before cleaning:  (5824,)
>>feature shape after cleaning:  (5641, 1)
>>target shape after cleaning:  (5641,)
>>feature shape after preProcessing:  (5641, 1)
>>target shape after preProcessing:  (5641,)


In [5]:
# Display the cleaned feature frame returned by Transformations.
X

Unnamed: 0,title
4995,hillari clinton berni sander meet battl end new york time
8280,donald trump hillari clinton edward snowden thursday even brief new york time
12778,liber get pass time media ignor leftist comment pedophilia breitbart
12612,trump support jail tri rig elect commit voter fraud
11342,air forc vet first american convict tri join islam state sentenc year
...,...
16569,clinton aid profit firm illeg rais million nonprofit clinton foundat
6777,trump ghostwrit explain beat debat new york time
2475,todd trump still asterisk breitbart
11061,gaiaport interweb gaia energet strengthen
