# Feature Extraction

In [1]:
import pandas as pd

# Read the IRAhandle_tweets_1.csv file into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute Windows path, so the cell only runs on a
# machine that has the file at exactly this location.
raw_csv_path = r"d:\Code\ml\troll-tweets\data\original\IRAhandle_tweets_1.csv"
raw_data = pd.read_csv(raw_csv_path)
raw_data.head()

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,...,account_type,retweet,account_category,new_june_2018,alt_external_id,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1
0,906000000000000000,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,...,Right,0,RightTroll,0,905874659358453760,914580356430536707,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/914580356430...,,
1,906000000000000000,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,...,Right,0,RightTroll,0,905874659358453760,914621840496189440,http://twitter.com/905874659358453760/statuses...,https://twitter.com/damienwoody/status/9145685...,,
2,906000000000000000,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,255,...,Right,1,RightTroll,0,905874659358453760,914623490375979008,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/913231923715...,,
3,906000000000000000,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,256,...,Right,0,RightTroll,0,905874659358453760,914639143690555392,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/914639143690...,,
4,906000000000000000,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,246,...,Right,1,RightTroll,0,905874659358453760,914312219952861184,http://twitter.com/905874659358453760/statuses...,https://twitter.com/realDonaldTrump/status/914...,,


In [2]:
# Restrict the raw frame to the columns used by the rest of the notebook,
# write that subset to 1_trimmed.csv (without the row index), and preview it.
keep_cols = [
    "content", "region", "language", "publish_date",
    "following", "followers", "updates",
    "account_type", "retweet", "account_category",
]

df = raw_data.loc[:, keep_cols]

df.to_csv('1_trimmed.csv', index=False)
df.head()

Unnamed: 0,content,region,language,publish_date,following,followers,updates,account_type,retweet,account_category
0,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,1052,9636,253,Right,0,RightTroll
1,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,10/1/2017 22:43,1054,9637,254,Right,0,RightTroll
2,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,1054,9637,255,Right,1,RightTroll
3,JUST IN: President Trump dedicates Presidents ...,Unknown,English,10/1/2017 23:52,1062,9642,256,Right,0,RightTroll
4,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,10/1/2017 2:13,1050,9645,246,Right,1,RightTroll


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the trimmed
# frame; called with default arguments, so only the numeric columns
# (following, followers, updates, retweet) are summarised.
df.describe()

Unnamed: 0,following,followers,updates,retweet
count,243891.0,243891.0,243891.0,243891.0
mean,2008.342079,2256.398219,6433.557208,0.633857
std,3625.387602,4781.906074,9520.379342,0.48175
min,0.0,0.0,1.0,0.0
25%,149.0,140.0,1052.0,0.0
50%,939.0,611.0,2759.0,1.0
75%,2284.0,2186.0,7603.0,1.0
max,21843.0,23890.0,70028.0,1.0


In [4]:
# Count of missing values in each column of the trimmed frame
df.isna().sum()

content              0
region              38
language             0
publish_date         0
following            0
followers            0
updates              0
account_type         0
retweet              0
account_category     0
dtype: int64

In [5]:
# Share of missing values per column, expressed as a fraction in [0, 1]
# (multiply by 100 to read it as a percentage)
df.isna().mean()

content             0.000000
region              0.000156
language            0.000000
publish_date        0.000000
following           0.000000
followers           0.000000
updates             0.000000
account_type        0.000000
retweet             0.000000
account_category    0.000000
dtype: float64

In [6]:
# Account-level ratio plus calendar features taken from the publish timestamp
import numpy as np

# A "following" count of 0 is swapped for NaN first, so the ratio for such rows
# is NaN rather than the result of a division by zero.
following_nonzero = df["following"].replace(0, np.nan)

# errors='coerce' turns any publish_date that cannot be parsed into NaT
publish_ts = pd.to_datetime(df['publish_date'], errors='coerce')

derived = pd.DataFrame()
derived["followers_to_following_ratio"] = df["followers"] / following_nonzero
derived['date'] = publish_ts
derived['hour_of_day'] = publish_ts.dt.hour
derived['day_of_week'] = publish_ts.dt.dayofweek  # Monday=0 ... Sunday=6
derived['day_of_month'] = publish_ts.dt.day

derived.head()

Unnamed: 0,followers_to_following_ratio,date,hour_of_day,day_of_week,day_of_month
0,9.159696,2017-10-01 19:58:00,19,6,1
1,9.143264,2017-10-01 22:43:00,22,6,1
2,9.143264,2017-10-01 22:50:00,22,6,1
3,9.079096,2017-10-01 23:52:00,23,6,1
4,9.185714,2017-10-01 02:13:00,2,6,1


In [7]:
# Extract entities from the text
import re
def extract_hashtags(text):
    """Collect every ``#word`` token found in ``text``.

    Returns a dict with two keys:
      'hashtags' - list of matched tokens, in order of appearance, each
                   including its leading '#'
      'count'    - number of tokens in that list
    """
    found = re.findall(r'#\w+', text)
    return {'hashtags': found, 'count': len(found)}

def extract_mentions(text):
    """Collect every ``@word`` token found in ``text``.

    Returns a dict with two keys:
      'mentions' - list of matched tokens, in order of appearance, each
                   including its leading '@'
      'count'    - number of tokens in that list
    """
    found = re.findall(r'@\w+', text)
    return {'mentions': found, 'count': len(found)}

def count_emojis(text):
    """Count runs of consecutive non-ASCII characters in ``text``.

    NOTE: despite the name, this is only a proxy for emoji usage. Any character
    outside the 7-bit ASCII range (accented letters, curly quotes, non-Latin
    scripts, emoji, ...) is matched, and a run of adjacent such characters is
    counted once, not once per character.
    """
    non_ascii_runs = re.finditer(r'[^\x00-\x7F]+', text)
    return sum(1 for _ in non_ascii_runs)

def count_special_characters(text):
    """Count the characters in ``text`` that are neither ASCII letters, ASCII
    digits, nor whitespace.

    Each such character is counted individually; non-ASCII letters (for
    example accented ones) are counted as special too, since only a-z, A-Z and
    0-9 are excluded.
    """
    specials = re.findall(r'[^a-zA-Z0-9\s]', text)
    return len(specials)

def all_caps(text):
    """Report whether every whitespace-separated word in ``text`` is upper-case.

    A word is judged with ``str.isupper()``, so a token that contains no cased
    character at all (for example "123" or "!!") counts as not upper-case and
    makes the result False.

    Returns:
        bool: True only when ``text`` has at least one word and all of its
        words are upper-case; False for empty or whitespace-only text.
    """
    words = text.split()
    # all() over an empty iterable is True, which would flag an empty or
    # whitespace-only text as "all caps"; treat "no words" as False instead.
    if not words:
        return False
    return all(word.isupper() for word in words)

def count_links(text):
    """Count the http:// and https:// links appearing in ``text``.

    A link is recognised by its scheme followed by at least one host-style
    character (word character, '-', '.', or a %XX escape); whatever follows
    the first other character (such as '/') is not part of the match but does
    not prevent the link from being counted.
    """
    links = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', text)
    return len(links)

def has_quote(text):
    """Return 1 when ``text`` holds two quote characters on the same line, else 0.

    Either quote character may be a double quote or a single quote and the two
    need not be of the same kind, so two apostrophes (as in "it's Bob's") are
    enough to yield 1. The pair must sit on one line because '.' does not
    match a newline.
    """
    quoted = re.search(r'["\'].*["\']', text)
    if quoted is None:
        return 0
    return 1

def starts_with_mention(text):
    """Return 1 if ``text``, ignoring leading whitespace, opens with a mention.

    Recognised openings are "@", "RT @", "MT @" and "'@"; anything else
    yields 0.
    """
    openings = (
        r'^@',       # plain mention
        r'^RT @',    # retweet prefix followed by a mention
        r'^MT @',    # "MT" prefix followed by a mention
        r'^\'@',     # mention preceded by a single quote
    )
    head = text.lstrip()
    return 1 if any(re.match(opening, head) for opening in openings) else 0
    
def starts_with_hashtag(text):
    """Return 1 if ``text``, ignoring leading whitespace, opens with a hashtag.

    Recognised openings are "#" and "'#"; anything else yields 0.
    """
    openings = (
        r'^#',       # text begins with "#"
        r'^\'#',     # text begins with "'#"
    )
    head = text.lstrip()
    return 1 if any(re.match(opening, head) for opening in openings) else 0

# Text-derived feature columns, all computed from the tweet body in df['content'].

# Run each entity extractor once per tweet and reuse its result for both the
# joined-string column and the count column, so the regex pass over the whole
# column is not repeated.
hashtag_info = df['content'].apply(extract_hashtags)
mention_info = df['content'].apply(extract_mentions)

# Column insertion order below determines the column order of `derived`.
derived['hashtags'] = hashtag_info.apply(lambda info: ', '.join(info['hashtags']))
derived['mentions'] = mention_info.apply(lambda info: ', '.join(info['mentions']))
derived['count_hashtags'] = hashtag_info.apply(lambda info: info['count'])
derived['count_mentions'] = mention_info.apply(lambda info: info['count'])
derived['count_emojis'] = df['content'].apply(count_emojis)
derived['count_special_characters'] = df['content'].apply(count_special_characters)
derived['word_count'] = df['content'].apply(lambda x: len(str(x).split()))
derived['count_links'] = df['content'].apply(count_links)
derived['text_length'] = df['content'].apply(len)
# Stored as 1/0 rather than True/False, like the other flag columns
derived['all_words_caps'] = df['content'].apply(all_caps).apply(lambda x: 1 if x else 0)
derived['has_quote'] = df['content'].apply(has_quote)
derived['starts_with_mention'] = df['content'].apply(starts_with_mention)
derived['starts_with_hashtag'] = df['content'].apply(starts_with_hashtag)

# check zero values
derived.describe()

Unnamed: 0,followers_to_following_ratio,date,hour_of_day,day_of_week,day_of_month,count_hashtags,count_mentions,count_emojis,count_special_characters,word_count,count_links,text_length,all_words_caps,has_quote,starts_with_mention,starts_with_hashtag
count,240418.0,243891,243891.0,243891.0,243891.0,243891.0,243891.0,243891.0,243891.0,243891.0,243891.0,243891.0,243891.0,243891.0,243891.0,243891.0
mean,2.841317,2016-09-28 00:31:14.841630464,12.328794,2.842061,15.393569,0.711691,0.247406,1.988032,18.644837,13.396981,1.024798,104.590383,0.000566,0.141563,0.05122,0.19086
min,0.0,2014-11-27 09:59:00,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,0.242647,2015-12-19 14:52:00,8.0,1.0,9.0,0.0,0.0,0.0,6.0,10.0,0.0,81.0,0.0,0.0,0.0,0.0
50%,1.062475,2016-11-28 14:37:00,13.0,3.0,15.0,0.0,0.0,0.0,10.0,13.0,1.0,110.0,0.0,0.0,0.0,0.0
75%,1.461847,2017-05-04 20:17:00,17.0,4.0,21.0,1.0,0.0,1.0,15.0,17.0,2.0,132.0,0.0,0.0,0.0,0.0
max,8752.0,2018-03-22 01:57:00,23.0,6.0,31.0,24.0,50.0,44.0,193.0,77.0,6.0,816.0,1.0,1.0,1.0,1.0
std,40.889037,,6.483359,1.940397,7.966292,1.166709,0.990731,3.893024,23.16642,5.529366,0.752663,33.985203,0.02378,0.348602,0.220446,0.39298


In [8]:
# Write the derived feature table to 1_derived.csv (relative path, row index
# omitted), then preview the first rows.
derived.to_csv('1_derived.csv', index=False)
derived.head()

Unnamed: 0,followers_to_following_ratio,date,hour_of_day,day_of_week,day_of_month,hashtags,mentions,count_hashtags,count_mentions,count_emojis,count_special_characters,word_count,count_links,text_length,all_words_caps,has_quote,starts_with_mention,starts_with_hashtag
0,9.159696,2017-10-01 19:58:00,19,6,1,,@nedryun,0,1,0,11,24,1,156,0,1,0,0
1,9.143264,2017-10-01 22:43:00,22,6,1,,,0,0,0,8,21,1,140,0,0,0,0
2,9.143264,2017-10-01 22:50:00,22,6,1,#BoycottNFL,,1,0,0,8,18,1,143,0,0,0,0
3,9.079096,2017-10-01 23:52:00,23,6,1,,,0,0,0,8,20,1,145,0,0,0,0
4,9.185714,2017-10-01 02:13:00,2,6,1,#StandForOurAnthem,,1,0,1,10,7,1,83,0,0,0,0


In [9]:
# Put the trimmed source columns and the derived features side by side
# (column-wise concatenation, rows matched on the index).
combined_df = pd.concat([df, derived], axis=1)

# If a column label occurs more than once, keep only its first occurrence
is_first_occurrence = ~combined_df.columns.duplicated()
combined_df = combined_df.loc[:, is_first_occurrence]
combined_df.to_csv('1_combined.csv', index=False)