In [2]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import pandas as pd
import re
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

# Define helper functions for feature engineering

- Number of special characters (done)
- Number of words per tweet
- Number of characters per tweet
- Number of characters
- Average word length (done)
- Ratio of stopwords (done)
- Existence of handle (done)
- Existence of link  (done)
- IsRetweet 'RT' (done)
- Number of uppercase words (done)
- Remove hashtag (done)

In [3]:
def data_preprocess(data_line):
    hashtag=r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)" # hash-tags
    data_line = re.sub(hashtag, "", data_line)
    data_line = re.sub(r'http\S+', 'http', data_line)
    data_line = data_line.replace("@handle","handle")
    data_line = data_line.replace("\n","")
    data_line = data_line.split("\t")
    return data_line

def get_stopword_ratio(data_line):
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(data_line)
    stop_word_list = [w for w in word_tokens if w in stop_words]
    return len(stop_word_list)/len(word_tokens)
  
def is_RT(tweet_line):
    if len(tweet_line) > 0: 
        return tweet_line.split(" ")[0] == "RT"
    return False
        
    
def is_modifiedRT(tweet_line):
    if is_RT(tweet_line):
        return False
    elif (re.search(r'RT @handle', tweet_line)):
        return True
    else:
        return False
      
def get_ratio_emoticon(tweet_line):
    emoticons_str = r"[:=;xX][oO0\-]?[dD\)\]\(\]/\\o0OpP]"
    emo_puc = r"[!+?+]"
    emo_list = re.findall(emoticons_str, tweet_line)
    emo_list += re.findall(emo_puc, tweet_line)
    return len(emo_list)/len(tweet_line)
  
def get_average_word_length(data_line):
    words = data_line.split()
    return sum(len(word) for word in words) / len(words)
  
def get_ratio_upper_characters(data_line):
    u = [x for x in data_line if x.isupper()]
    return len(u)/len(data_line)
  
def get_ratio_special_characters(data_line):
    return sum([data_line.count(i) for i in ["!","?"]])/len(data_line)
  
def get_num_words(data_line):
    return len(data_line.split())  
  
  


# Read data

We read in the data line by line. The above helper functions are you used create features.

First we define list of features then in the last step we put it all together in a dataframe. It is about 10x faster.

In [4]:
def read_prep(tweets_file):
    author_Id = [] 
    tweet = [] 
    num_words =  [] 
    ratio_spec_char = []
    ratio_stopwords = []
    ratio_upper = []
    average_word_length = []
    is_retweet = []
    is_modified_retweet = [] 
    ratio_emoticon = [] 

    with open(tweets_file, "r") as file:
        for i in file:
            data_line = i
            author_Id.append( int(data_preprocess(data_line)[0]))
            tweet.append( data_preprocess(data_line)[1])
            num_words.append(  get_num_words(data_line))
            ratio_spec_char.append( get_ratio_special_characters(data_line))
            ratio_stopwords.append( get_stopword_ratio(data_line))
            ratio_upper.append( get_ratio_upper_characters(data_line))
            average_word_length.append( get_average_word_length(data_line))
            is_retweet.append( is_RT(data_line))
            is_modified_retweet.append( is_modifiedRT(data_line))
            ratio_emoticon.append( get_ratio_emoticon(data_line))

    df = pd.DataFrame({'author_Id':author_Id,
                       'tweet':tweet,
                       'num_words':num_words,
                       'ratio_spec_char':ratio_spec_char,
                       'ratio_stopwords':ratio_stopwords,
                       'ratio_upper':ratio_upper,
                       'average_word_length':average_word_length,
                       'is_retweet':is_retweet,
                       'is_modified_retweet':is_modified_retweet,
                       'ratio_emoticon':ratio_emoticon})
    return df

In [5]:
from sklearn.model_selection import train_test_split
clean_train_data = read_prep("data/train_tweets.txt")
y = clean_train_data.author_Id
X = clean_train_data.drop('author_Id', axis=1)

X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.25, random_state=69)

In [17]:
print(len(X_train))
print(len(X_dev))
print(len(y_train))
print(len(y_dev))

246699
82233
246699
82233


In [18]:
train_data = pd.concat([X_train, y_train],axis=1)
dev_data = pd.concat([X_dev, y_dev],axis=1)
train_data.to_csv("data/train_data.csv")
dev_data.to_csv("data/dev_data.csv")

In [19]:
print('\nX_train:\n')
print(X_train.head())
print('\ny_train:\n')
print(y_train.head())
print('\ntrain_data:\n')
print(train_data.head())


X_train:

                                                    tweet  num_words  \
236809  handle I miss you Hun. I've been grindin. Imma...         26   
199574  handle Oh it happened...I saw the belt on the ...         21   
158427  won a signed poster and tkts to handle at hand...         20   
256458                             I want Lisa's earrings          5   
201048                            handle Wellllll... http          4   

        ratio_spec_char  ratio_stopwords  ratio_upper  average_word_length  \
236809         0.000000         0.285714     0.059701             4.153846   
199574         0.000000         0.333333     0.026549             4.380952   
158427         0.009434         0.259259     0.018868             4.300000   
256458         0.000000         0.000000     0.076923             4.200000   
201048         0.000000         0.000000     0.086957            10.500000   

        is_retweet  is_modified_retweet  ratio_emoticon  
236809       False           