In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from ekphrasis.dicts.noslang.slangdict import slangdict
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.classes.preprocessor import TextPreProcessor

#train the model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns

#vectorize the text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [2]:
def _read_labelled_tweets(path, label):
    """
    Read one tweet per non-empty line of `path` and attach a constant label.

    Parameters
    ----------
    path : str
        Text file to read; lines are separated by "\n" and empty lines are skipped.
    label : float
        Value stored in the "label" column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty line, with columns "text" and "label" (float64).
    """
    # Explicit encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the data files are UTF-8 - confirm.
    with open(path, encoding="utf-8") as f:
        tweets = [line for line in f.read().split("\n") if len(line) > 0]
    # The file is already closed here; the frame is built from the in-memory list.
    frame = pd.DataFrame(tweets, columns=["text"])
    frame["label"] = np.full(len(frame), label, dtype=np.float64)
    return frame


# Negative tweets are labelled -1.0 and positive tweets 1.0, then stacked
# (negatives first) under a fresh 0..n-1 index.
df_train_neg = _read_labelled_tweets("./data/train_neg_full.txt", -1.0)
df_train_pos = _read_labelled_tweets("./data/train_pos_full.txt", 1.0)
df_train = pd.concat([df_train_neg, df_train_pos], axis=0, ignore_index=True)
df_train

Unnamed: 0,text,label
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,-1.0
1,glad i dot have taks tomorrow ! ! #thankful #s...,-1.0
2,1-3 vs celtics in the regular season = were fu...,-1.0
3,<user> i could actually kill that girl i'm so ...,-1.0
4,<user> <user> <user> i find that very hard to ...,-1.0
...,...,...
2499995,a warning sign ? (; rt <user> the negativity y...,1.0
2499996,<user> ff too thank youuu ) ),1.0
2499997,i just love shumpa ! that's my girl,1.0
2499998,the best way to start a day ! no matter what h...,1.0


In [3]:
def _after_first_comma(raw_line):
    """Return the part of `raw_line` that follows its first comma."""
    return raw_line.split(',', 1)[1]


# Read the test file as a single column (tab separator, no header row).
df_test = pd.read_csv('./data/test_data.txt', sep='\t', header=None)
df_test.columns = ['text']
# Drop everything up to and including the first comma of every line.
df_test['text'] = df_test['text'].map(_after_first_comma)

In [4]:
# Shared NLP resources used by the cleaning code below:
# a tweet-aware tokenizer, a WordNet lemmatizer and the English stop-word set.
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# One character followed by one or more copies of itself.
_REPEATED_CHAR_RUN = re.compile(r'(.)\1+')


def handle_repeating_char(text):
    """
    Shorten every run of a repeated character to exactly two occurrences.

    Runs of length two are returned unchanged; longer runs are cut down,
    e.g. "goood" -> "good" and "!!!!" -> "!!".
    """
    return _REPEATED_CHAR_RUN.sub(lambda run: run.group(1) * 2, text)

# ekphrasis pre-processing pipeline; clean_text calls its pre_process_doc
# method when text_cleaning=True.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # ('url' used to be listed twice; a single entry is enough)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number'],
    # corpus from which the word statistics are going to be used for word segmentation
    segmenter="twitter",
    # corpus from which the word statistics are going to be used for spell correction
    corrector="twitter",
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=tokenizer.tokenize,
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)


def clean_text(text, lemmatize = True, remove_stopwords = True, caseFolding = True, slang = True, double = True, text_cleaning = True):
    """
    Clean one tweet and return it as a single space-separated string.

    The steps run in this order; each optional step is switched by its flag:

    1. caseFolding      - lower-case the raw text.
    2. (always)         - tokenize with the module-level `tokenizer`.
    3. slang            - replace tokens that are keys of `slangdict` by their value.
    4. double           - pass every token through `handle_repeating_char`.
    5. text_cleaning    - join the tokens and run `text_processor.pre_process_doc`
                          on the result (expected to return a list of tokens).
    6. remove_stopwords - drop tokens contained in `stop_words`.
    7. lemmatize        - lemmatize every token with `lemmatizer`.

    Parameters
    ----------
    text : str
        The raw tweet.
    lemmatize, remove_stopwords, caseFolding, slang, double, text_cleaning : bool
        Switches for the steps listed above (all enabled by default).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if caseFolding:
        text = text.lower()

    tokens = tokenizer.tokenize(text)
    if slang:
        tokens = [slangdict.get(token, token) for token in tokens]
    if double:
        tokens = [handle_repeating_char(token) for token in tokens]

    joined = ' '.join(tokens)
    if text_cleaning:
        tokens = text_processor.pre_process_doc(joined)
    else:
        # Keep working on tokens. Previously the joined *string* was carried
        # forward, so the steps below iterated over single characters and the
        # final join put a space between every character.
        # Splitting the joined text also breaks multi-word slang expansions
        # into separate tokens.
        tokens = joined.split()

    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words]
    if lemmatize:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)


 

Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [5]:
def _clean_keeping_stopwords(tweet):
    """Run clean_text on one tweet with stop-word removal switched off."""
    return clean_text(tweet, remove_stopwords=False)


# Clean every training tweet, then carry the labels over unchanged.
df_train_cleaned = df_train['text'].map(_clean_keeping_stopwords).to_frame()
df_train_cleaned["label"] = df_train["label"]
df_train_cleaned

In [None]:
# Clean every test tweet with the same settings as the training set
# (stop words are kept).
df_test_cleaned = df_test['text'].map(
    lambda tweet: clean_text(tweet, remove_stopwords=False)
)

In [17]:
# Persist the cleaned training frame (columns "text" and "label") without the index.
df_train_cleaned.to_csv(path_or_buf="./data/train_cleaned.txt", index=False)

In [None]:
# Persist the cleaned test tweets (the result of applying clean_text to
# df_test['text']) without the index.
df_test_cleaned.to_csv("./data/test_cleaned.txt",index=False)

In [None]:
# Token counts (split on single spaces) of the 1000 longest cleaned training tweets.
# The "text" column is selected first: DataFrame.apply would pass whole columns
# (Series) to the lambda, and a Series has no .split method.
df_train_cleaned["text"].apply(lambda tweet: len(tweet.split(" "))).sort_values(ascending=False).iloc[:1000]

In [3]:
# Reload the cleaned training set from disk.
# keep_default_na=False together with dtype str keeps every tweet a string:
# by default read_csv turns empty fields and values such as "nan" or "null"
# into float NaN, on which .split below would raise.
df_train_cleaned = pd.read_csv(
    "./data/train_cleaned.txt",
    dtype={"text": str},
    keep_default_na=False,
)
# Number of space-separated tokens per tweet, longest first.
df_train_cleaned["text"].apply(lambda tweet: len(tweet.split(" "))).sort_values(ascending=False)

1462494    130
1657593    109
269709     105
1588769    103
965559      98
          ... 
1694546      1
1694545      1
1694544      1
1694543      1
1694390      1
Name: text, Length: 2500000, dtype: int64