# Feature Extraction

In [1]:
import pandas as pd

raw_data = pd.read_csv(r"d:\Code\ml\troll-tweets\data\original\IRAhandle_tweets_2.csv")
raw_data.head()

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,...,account_type,retweet,account_category,new_june_2018,alt_external_id,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1
0,2497991305,AUSTINLOVESBEER,NHS fails to treat one in six cancer patients ...,United States,English,3/8/2017 9:00,3/8/2017 9:00,41,34,176,...,Right,1,RightTroll,0,2497991305,839400338515881984,http://twitter.com/2497991305/statuses/8394003...,http://trib.al/hBZijWg,,
1,2497991305,AUSTINLOVESBEER,Real reason Alexis Sanchez walked out of Arsen...,United States,English,3/8/2017 9:00,3/8/2017 9:00,41,34,180,...,Right,1,RightTroll,0,2497991305,839400484326690817,http://twitter.com/2497991305/statuses/8394004...,https://twitter.com/SunSport/status/8393826823...,http://thesun.uk/60148X0Di,
2,2497991305,AUSTINLOVESBEER,George Michael cause of death revealed: What i...,United States,English,3/8/2017 9:00,3/8/2017 9:00,41,34,177,...,Right,1,RightTroll,0,2497991305,839400379238379520,http://twitter.com/2497991305/statuses/8394003...,https://twitter.com/Daily_Star/status/83938393...,http://bit.ly/2mewcAw,
3,2497991305,AUSTINLOVESBEER,Russian TV crew 'offer Swedish teenagers money...,United States,English,3/8/2017 9:00,3/8/2017 9:00,41,34,178,...,Right,1,RightTroll,0,2497991305,839400406463631360,http://twitter.com/2497991305/statuses/8394004...,http://ind.pn/2lXx9w2,,
4,2497991305,AUSTINLOVESBEER,Donald Trump met Russian ambassador during ele...,United States,English,3/8/2017 9:00,3/8/2017 9:00,41,34,179,...,Right,1,RightTroll,0,2497991305,839400447295160321,http://twitter.com/2497991305/statuses/8394004...,http://ind.pn/2lCVb3r,,


In [None]:
# keep only the columns we need
keep_cols = [
        "content",
        "region",
        "language",
        "publish_date",
        "following",
        "followers",
        "updates",
        "account_type",
        "retweet",
        "account_category",
    ]

df = raw_data[keep_cols]

df.to_csv('1_trimmed.csv', index=False)
df.head()

Unnamed: 0,content,region,language,publish_date,following,followers,updates,account_type,retweet,account_category
0,NHS fails to treat one in six cancer patients ...,United States,English,3/8/2017 9:00,41,34,176,Right,1,RightTroll
1,Real reason Alexis Sanchez walked out of Arsen...,United States,English,3/8/2017 9:00,41,34,180,Right,1,RightTroll
2,George Michael cause of death revealed: What i...,United States,English,3/8/2017 9:00,41,34,177,Right,1,RightTroll
3,Russian TV crew 'offer Swedish teenagers money...,United States,English,3/8/2017 9:00,41,34,178,Right,1,RightTroll
4,Donald Trump met Russian ambassador during ele...,United States,English,3/8/2017 9:00,41,34,179,Right,1,RightTroll


In [3]:
# describe the data
df.describe()

Unnamed: 0,following,followers,updates,retweet
count,250520.0,250520.0,250520.0,250520.0
mean,2503.84539,3021.228425,5546.732744,0.43478
std,2937.808532,5494.761345,5345.45039,0.495729
min,0.0,0.0,1.0,0.0
25%,412.0,387.0,1505.0,0.0
50%,1711.0,1113.0,3441.0,0.0
75%,4151.0,2726.0,8554.0,1.0
max,30194.0,40788.0,24248.0,1.0


In [4]:
# number of null values
df.isnull().sum()

content               0
region              522
language              0
publish_date          0
following             0
followers             0
updates               0
account_type          0
retweet               0
account_category      0
dtype: int64

In [5]:
# percentage of null values
df.isnull().sum() / len(df)

content             0.000000
region              0.002084
language            0.000000
publish_date        0.000000
following           0.000000
followers           0.000000
updates             0.000000
account_type        0.000000
retweet             0.000000
account_category    0.000000
dtype: float64

In [6]:
# followers to following ratio
import numpy as np

derived = pd.DataFrame()
derived["followers_to_following_ratio"] = df["followers"].div(df["following"].replace(0, np.nan))
derived['date'] = pd.to_datetime(df['publish_date'], errors='coerce')
derived['hour_of_day'] = derived['date'].dt.hour
derived['day_of_week'] = derived['date'].dt.dayofweek
derived['day_of_month'] = derived['date'].dt.day

derived.head()

Unnamed: 0,followers_to_following_ratio,date,hour_of_day,day_of_week,day_of_month
0,0.829268,2017-03-08 09:00:00,9,2,8
1,0.829268,2017-03-08 09:00:00,9,2,8
2,0.829268,2017-03-08 09:00:00,9,2,8
3,0.829268,2017-03-08 09:00:00,9,2,8
4,0.829268,2017-03-08 09:00:00,9,2,8


In [7]:
# Extract entities from the text
import re
def extract_hashtags(text):
    return {
        'hashtags': re.findall(r'#\w+', text),
        'count': len(re.findall(r'#\w+', text))
    }

def extract_mentions(text):
    return {
        'mentions': re.findall(r'@\w+', text),
        'count': len(re.findall(r'@\w+', text))
    }

def count_emojis(text):
    return len(re.findall(r'[^\x00-\x7F]+', text))

def count_special_characters(text):
    return len(re.findall(r'[^a-zA-Z0-9\s]', text))

def all_caps(text):
    return all(word.isupper() for word in text.split())

def count_links(text):
    return len(re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', text))

def has_quote(text):
    return 1 if re.search(r'["\'].*["\']', text) else 0

def starts_with_mention(text):
    stripped = text.lstrip()  # Remove leading whitespace
    patterns = [
        r'^@',       # Matches strings starting with "@"
        r'^RT @',    # Matches strings starting with "RT @"
        r'^MT @',    # Matches strings starting with "MT @"
        r'^\'@'       # Matches strings starting with "'@"
    ]
    for pattern in patterns:
        if re.match(pattern, stripped):
            return 1  # Indicates a match
    return 0  # No match
    
def starts_with_hashtag(text):
    stripped = text.lstrip()  # Remove leading whitespace
    patterns = [
        r'^#',       # Matches strings starting with "@"
        r'^\'#'       # Matches strings starting with "'@"
    ]
    for pattern in patterns:
        if re.match(pattern, stripped):
            return 1  # Indicates a match
    return 0  # No match

derived['hashtags'] = df['content'].apply(extract_hashtags).apply(lambda x: ', '.join(x['hashtags']))
derived['mentions'] = df['content'].apply(extract_mentions).apply(lambda x: ', '.join(x['mentions']))
derived['count_hashtags'] = df['content'].apply(extract_hashtags).apply(lambda x: x['count'])
derived['count_mentions'] = df['content'].apply(extract_mentions).apply(lambda x: x['count'])
derived['count_emojis'] = df['content'].apply(count_emojis)
derived['count_special_characters'] = df['content'].apply(count_special_characters)
derived['word_count'] = df['content'].apply(lambda x: len(str(x).split()))
derived['count_links'] = df['content'].apply(count_links)
derived['text_length'] = df['content'].apply(len)
derived['all_words_caps'] = df['content'].apply(all_caps).apply(lambda x: 1 if x else 0)
derived['has_quote'] = df['content'].apply(has_quote)
derived['starts_with_mention'] = df['content'].apply(starts_with_mention)
derived['starts_with_hashtag'] = df['content'].apply(starts_with_hashtag)

# check zero values
derived.describe()

Unnamed: 0,followers_to_following_ratio,date,hour_of_day,day_of_week,day_of_month,count_hashtags,count_mentions,count_emojis,count_special_characters,word_count,count_links,text_length,all_words_caps,has_quote,starts_with_mention,starts_with_hashtag
count,228475.0,250520,250520.0,250520.0,250520.0,250520.0,250520.0,250520.0,250520.0,250520.0,250520.0,250520.0,250520.0,250520.0,250520.0,250520.0
mean,6.613766,2016-10-04 17:30:44.027622656,12.824118,2.791494,15.242372,0.530325,0.357492,1.311847,13.817021,12.22391,0.959444,96.280513,0.000351,0.112358,0.037973,0.110957
min,0.0,2012-10-30 12:27:00,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,0.528211,2016-04-01 15:31:00,8.0,1.0,7.0,0.0,0.0,0.0,6.0,9.0,0.0,73.0,0.0,0.0,0.0,0.0
50%,1.036735,2016-10-31 23:34:00,14.0,3.0,15.0,0.0,0.0,0.0,8.0,12.0,1.0,98.0,0.0,0.0,0.0,0.0
75%,1.479497,2017-05-09 15:39:00,18.0,4.0,23.0,1.0,0.0,1.0,12.0,15.0,1.0,124.0,0.0,0.0,0.0,0.0
max,28842.0,2018-05-27 12:54:00,23.0,6.0,31.0,14.0,50.0,44.0,231.0,69.0,5.0,767.0,1.0,1.0,1.0,1.0
std,116.094867,,6.437541,1.968604,8.895742,0.984858,1.170742,3.199947,18.738247,5.440418,0.691248,34.306034,0.018739,0.315807,0.191131,0.31408


In [8]:
# save to project folder
derived.to_csv('1_derived.csv', index=False)
derived.head()

Unnamed: 0,followers_to_following_ratio,date,hour_of_day,day_of_week,day_of_month,hashtags,mentions,count_hashtags,count_mentions,count_emojis,count_special_characters,word_count,count_links,text_length,all_words_caps,has_quote,starts_with_mention,starts_with_hashtag
0,0.829268,2017-03-08 09:00:00,9,2,8,,,0,0,0,5,12,1,77,0,0,0,0
1,0.829268,2017-03-08 09:00:00,9,2,8,,,0,0,0,10,12,2,114,0,0,0,0
2,0.829268,2017-03-08 09:00:00,9,2,8,,,0,0,0,12,16,2,137,0,0,0,0
3,0.829268,2017-03-08 09:00:00,9,2,8,,,0,0,0,7,15,1,108,0,1,0,0
4,0.829268,2017-03-08 09:00:00,9,2,8,,,0,0,0,5,9,1,84,0,0,0,0


In [9]:
combined_df = pd.concat([df, derived], axis=1)

# Handle duplicated columns (if any)
combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]
combined_df.to_csv('1_combined.csv', index=False)