In [19]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import pandas as pd
import re
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

# Define helper functions for feature engineering

- Number of special characters (done)
- Number of words per tweet
- Number of characters per tweet
- Number of characters
- Average word length (done)
- Ratio of stopwords (done)
- Existence of handle (done)
- Existence of link  (done)
- IsRetweet 'RT' (done)
- Number of uppercase words (done)
- Remove hashtag (done)

In [120]:
def data_preprocess(data_line):
    hashtag=r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)" # hash-tags
    data_line = re.sub(hashtag, "", data_line)
    data_line = re.sub(r'http\S+', 'http', data_line)
    data_line = data_line.replace("@handle","handle")
    data_line = data_line.replace("\n","")
    data_line = data_line.split("\t")
    return data_line

def get_stopword_ratio(data_line):
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(data_line)
    stop_word_list = [w for w in word_tokens if w in stop_words]
    return len(stop_word_list)/len(word_tokens)
  
def is_RT(tweet_line):
    if len(tweet_line) > 0: 
        return tweet_line.split(" ")[0] == "RT"
    return False
        
    
def is_modifiedRT(tweet_line):
    if is_RT(tweet_line):
        return False
    elif (re.search(r'RT @handle', tweet_line)):
        return True
    else:
        return False
      
def get_ratio_emoticon(tweet_line):
    emoticons_str = r"[:=;xX][oO0\-]?[dD\)\]\(\]/\\o0OpP]"
    emo_puc = r"[!+?+]"
    emo_list = re.findall(emoticons_str, tweet_line)
    emo_list += re.findall(emo_puc, tweet_line)
    return len(emo_list)/len(tweet_line)
  
def get_average_word_length(data_line):
    words = data_line.split()
    return sum(len(word) for word in words) / len(words)
  
def get_ratio_upper_characters(data_line):
    u = [x for x in data_line if x.isupper()]
    return len(u)/len(data_line)
  
def get_ratio_special_characters(data_line):
    return sum([data_line.count(i) for i in ["!","?"]])/len(data_line)
  
def get_num_words(data_line):
    return len(data_line.split())  
  
  


# Read data

We read in the data line by line. The above helper functions are you used create features.

First we define list of features then in the last step we put it all together in a dataframe. It is about 10x faster.

In [130]:
def read_prep(tweets_file):
    author_Id = [] 
    tweet = [] 
    num_words =  [] 
    ratio_spec_char = []
    ratio_stopwords = []
    ratio_upper = []
    average_word_length = []
    is_retweet = []
    is_modified_retweet = [] 
    ratio_emoticon = [] 

    with open(tweets_file, "r") as file:
        for i in file:
            data_line = file.readline()
            author_Id.append( int(data_preprocess(data_line)[0]))
            tweet.append( data_preprocess(data_line)[1])
            num_words.append(  get_num_words(data_line))
            ratio_spec_char.append( get_ratio_special_characters(data_line))
            ratio_stopwords.append( get_stopword_ratio(data_line))
            ratio_upper.append( get_ratio_upper_characters(data_line))
            average_word_length.append( get_average_word_length(data_line))
            is_retweet.append( is_RT(data_line))
            is_modified_retweet.append( is_modifiedRT(data_line))
            ratio_emoticon.append( get_ratio_emoticon(data_line))
            
    
    df = pd.DataFrame({'author_Id':author_Id,
                       'tweet':tweet,
                       'num_words':num_words,
                       'ratio_spec_char':ratio_spec_char,
                       'ratio_stopwords':ratio_stopwords,
                       'ratio_upper':ratio_upper,
                       'average_word_length':average_word_length,
                       'is_retweet':is_retweet,
                       'is_modified_retweet':is_modified_retweet,
                       'ratio_emoticon':ratio_emoticon})
    return df

In [131]:
clean_train_data = read_prep("train_tweets.txt")
clean_train_data.to_csv("clean_train_data.csv")
clean_train_data.head()

Unnamed: 0,author_Id,tweet,num_words,ratio_spec_char,ratio_stopwords,ratio_upper,average_word_length,is_retweet,is_modified_retweet,ratio_emoticon
0,8746,Going to watch Grey's on the big screen - Thur...,12,0.0,0.2,0.041667,5.0,False,False,0.0
1,8746,handle Hi there! Been traveling a lot and lots...,28,0.021739,0.3125,0.021739,3.928571,False,False,0.021739
2,8746,RT handle: Ft. Hood officials confirm the 2 ot...,18,0.0,0.238095,0.036036,5.166667,False,True,0.0
3,8746,handle How did u get the invite Justin?,9,0.021739,0.181818,0.043478,4.111111,False,False,0.021739
4,8746,handle I remember! I am fine - how are u? What...,13,0.05,0.166667,0.05,3.615385,False,False,0.05


In [133]:
pd.read_csv("clean_train_data.csv")

Unnamed: 0.1,Unnamed: 0,author_Id,tweet,num_words,ratio_spec_char,ratio_stopwords,ratio_upper,average_word_length,is_retweet,is_modified_retweet,ratio_emoticon
0,0,8746,Going to watch Grey's on the big screen - Thur...,12,0.000000,0.200000,0.041667,5.000000,False,False,0.000000
1,1,8746,handle Hi there! Been traveling a lot and lots...,28,0.021739,0.312500,0.021739,3.928571,False,False,0.021739
2,2,8746,RT handle: Ft. Hood officials confirm the 2 ot...,18,0.000000,0.238095,0.036036,5.166667,False,True,0.000000
3,3,8746,handle How did u get the invite Justin?,9,0.021739,0.181818,0.043478,4.111111,False,False,0.021739
4,4,8746,handle I remember! I am fine - how are u? What...,13,0.050000,0.166667,0.050000,3.615385,False,False,0.050000
5,5,8746,handle I don't want to picture u sitting on it...,17,0.000000,0.238095,0.024096,3.882353,False,False,0.000000
6,6,8746,handle Grrr....you must be going crazy!,7,0.021739,0.090909,0.021739,5.571429,False,False,0.021739
7,7,8746,RT handle: If you're looking for some great li...,18,0.009009,0.250000,0.045045,5.166667,False,True,0.018018
8,8,8746,"RT handle: Director of Global Brand Marketing,...",21,0.000000,0.064516,0.092199,5.714286,False,True,0.014184
9,9,8746,"RT handle: ""Only surround yourself with people...",16,0.000000,0.227273,0.046729,5.687500,False,True,0.000000
