In [19]:
!pip install vaderSentiment



In [20]:
pip install langdetect



In [21]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from langdetect import detect, LangDetectException
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
twitter_df = pd.read_csv('/content/drive/MyDrive/personal project/twitter_df.csv')

In [24]:
# Cell 2: Define helper functions
def is_english(text):
    if not isinstance(text, str) or text.strip() == '':
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

def count_words(text):
    return len(text.split())

def process_tweet(text):
    hashtags = re.findall(r'#(\S+)', text)
    user_mentions = re.findall(r'@[\S]+', text)

    text = re.sub(r'(#\S+\s*){2,}', '#HASHTAG ', text)
    text = re.sub(r'(@[\S]+\s*){2,}', '@USER_MENTION ', text)
    text = re.sub(r'(https?://\S+|www\.\S+)(\s|$)', 'URL ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\+?\d[\d -]{8,}\d', '', text)

    return text, hashtags, user_mentions

def get_sentiment_scores(text):
    scores = analyzer.polarity_scores(text)

    if scores['pos'] > 0.24 and scores['neg'] == 0 and scores['compound'] > 0.45:
        category = 'positive'
    elif scores['neg'] > 0.1 or scores['neg'] > scores['pos']:
        category = 'negative'
    else:
        category = 'neutral'

    return {
        'scores': scores,
        'category': category
    }

def format_sentiment_scores(sentiment_dict):
    scores = sentiment_dict['scores']
    return f"{{'neg': {scores['neg']:.3f}, 'neu': {scores['neu']:.3f}, 'pos': {scores['pos']:.3f}, 'compound': {scores['compound']:.4f}}}"

def add_sentiment_analysis(df, text_column):
    df['sentiment_analysis'] = df[text_column].apply(get_sentiment_scores)
    df['sentiment_scores'] = df['sentiment_analysis'].apply(format_sentiment_scores)
    df['sentiment'] = df['sentiment_analysis'].apply(lambda x: x['category'])
    return df.drop(columns=['sentiment_analysis'])

In [25]:
# Twitter data cleaning
def clean_twitter_data(twitter_df):
    # Reset index to ensure unique index values
    twitter_df = twitter_df.reset_index(drop=True)

    # Clean the 'text' column and extract hashtags and user mentions
    processed_tweets = twitter_df['text'].apply(process_tweet)  # Apply process_tweet to each row
    twitter_df['cleaned_text'] = processed_tweets.apply(lambda x: x[0])  # Store cleaned text
    twitter_df['hashtags'] = processed_tweets.apply(lambda x: x[1])  # Store hashtags
    twitter_df['user_mentions'] = processed_tweets.apply(lambda x: x[2])  # Store user mentions

    # Remove "#HASHTAG", "@USER_MENTION", and "URL" from the cleaned_text column
    twitter_df['cleaned_text'] = twitter_df['cleaned_text'].str.replace('#HASHTAG', '', regex=False).str.replace('@USER_MENTION', '', regex=False).str.replace('URL', '', regex=False).str.strip()

    # Filter non-ASCII tweets and short tweets
    twitter_df = twitter_df[twitter_df['cleaned_text'].apply(lambda x: x.isascii())]
    twitter_df = twitter_df[twitter_df['cleaned_text'].apply(count_words) > 5]

    # Filter non-English tweets
    twitter_df = twitter_df[twitter_df['cleaned_text'].apply(is_english)]

    # Drop duplicates based on the 'cleaned_text' column
    twitter_df = twitter_df.drop_duplicates(subset=['cleaned_text'])

    # Drop the original 'text' column and rename 'cleaned_text' to 'text'
    twitter_df = twitter_df.drop(columns=['text'])
    twitter_df = twitter_df.rename(columns={'cleaned_text': 'text'})

    # Reset index again after dropping duplicates
    twitter_df = twitter_df.reset_index(drop=True)

    return twitter_df[['text', 'hashtags', 'user_mentions', 'date']]  # Now includes hashtags and user_mentions

twitter_df = clean_twitter_data(twitter_df)
twitter_df = add_sentiment_analysis(twitter_df, 'text')
twitter_df.to_csv('cleaned_twitter_data.csv', index=False)
print("Twitter data processed and saved.")
twitter_df.head()

Twitter data processed and saved.


Unnamed: 0,text,hashtags,user_mentions,date,sentiment_scores,sentiment
0,What do we know about the age reduction on the...,"[MigrationStrategy, NewMigrationStrategy, Aust...",[],2023-12-15T00:14:20.000Z,"{'neg': 0.000, 'neu': 1.000, 'pos': 0.000, 'co...",neutral
1,When the Migration Strategy will be implemented?,"[MigrationStrategy, NewMigrationStrategy, Aust...",[],2023-12-15T00:09:33.000Z,"{'neg': 0.000, 'neu': 1.000, 'pos': 0.000, 'co...",neutral
2,Thank you all people who supported the 485visa...,"[auspol, Australia.]",[],2022-02-05T09:04:16.000Z,"{'neg': 0.000, 'neu': 0.858, 'pos': 0.142, 'co...",neutral
3,Attention TR-485 visa holders! Is your Visa ex...,"[sc408, sc485, tr408, tr485, 408visa, 485visa,...",[],2023-06-02T07:28:34.000Z,"{'neg': 0.000, 'neu': 0.906, 'pos': 0.094, 'co...",neutral
4,"Who will compensate our wasted time, destroyed...",[],"[@CNN, @BBC, @SBS, @tomwconnell, @TuckerCarlso...",2021-10-16T19:32:36.000Z,"{'neg': 0.390, 'neu': 0.441, 'pos': 0.169, 'co...",negative


In [26]:
# Cell 4: Reddit data processing
def clean_reddit_data(posts_df, comments_df):
    posts_df = posts_df.rename(columns={'text_body': 'post_body', 'time_created': 'post_createdat'})
    comments_df = comments_df.rename(columns={'text_body': 'comment_body', 'time_created': 'comment_createdat'})

    df = posts_df.merge(comments_df, on='post_id', how='inner')
    df = df.drop(columns=[col for col in df.columns if 'Unnamed' in col])

    if 'user' in df.columns:
        df = df[df['user'] != 'AutoModerator']
        df = df.drop(columns=['user'])
    if 'href' in df.columns:
        df = df.drop(columns=['href'])

    df['post_body'] = df['post_body'].astype(str)
    df['comment_body'] = df['comment_body'].astype(str)

    df = df[df['post_body'].apply(is_english) & df['comment_body'].apply(is_english)]
    df = df[df['comment_body'].str.strip() != '']
    df = df[df['comment_body'].apply(count_words) >= 5]

    columns_order = ['post_id', 'post_body', 'post_createdat', 'comment_body', 'comment_createdat']
    df = df[columns_order]

    return df.drop_duplicates(subset=['post_body', 'comment_body']).reset_index(drop=True)

# Load and process Reddit data
reddit_posts = pd.read_csv('/content/drive/MyDrive/personal project/reddit_posts.csv')
reddit_comments = pd.read_csv('/content/drive/MyDrive/personal project/reddit_comments.csv')
reddit_df = clean_reddit_data(reddit_posts, reddit_comments)
reddit_df = add_sentiment_analysis(reddit_df, 'comment_body')
reddit_df.to_csv('cleaned_reddit_data.csv', index=False)
print("Reddit data processed and saved.")
reddit_df.head()

Reddit data processed and saved.


Unnamed: 0,post_id,post_body,post_createdat,comment_body,comment_createdat,sentiment_scores,sentiment
0,1br9mt3,Eligibility for 485 Visa (Combine 1 yr Bachelo...,2024-03-30T04:59:10.118Z,I believe 485 counted per course so cannot be ...,2024-03-30T05:07:25.059Z,"{'neg': 0.000, 'neu': 1.000, 'pos': 0.000, 'co...",neutral
1,1br9mt3,Eligibility for 485 Visa (Combine 1 yr Bachelo...,2024-03-30T04:59:10.118Z,Please update me on this situation if it's pos...,2024-03-31T11:38:30.032Z,"{'neg': 0.000, 'neu': 0.813, 'pos': 0.187, 'co...",neutral
2,1b12aas,Will I be eligible for subclass 485 after comp...,2024-02-27T03:56:10.775Z,But this is for a separate student visa? So no...,2024-02-27T04:08:05.636Z,"{'neg': 0.084, 'neu': 0.916, 'pos': 0.000, 'co...",negative
3,1b12aas,Will I be eligible for subclass 485 after comp...,2024-02-27T03:56:10.775Z,No is the simple answer. I made the same mista...,2024-03-19T20:58:40.774Z,"{'neg': 0.146, 'neu': 0.854, 'pos': 0.000, 'co...",negative
4,zn1ns7,Grad visa 485 after masters degree (200 points...,2022-12-16T00:47:37.002Z,"I had an issue similar to yours, was granted w...",2022-12-16T14:15:04.577Z,"{'neg': 0.000, 'neu': 0.940, 'pos': 0.060, 'co...",neutral


In [27]:
# Cell 5: Guardian data processing
def clean_guardian_data(df1):
    df1 = df1.drop(columns=['Unnamed: 0'], errors='ignore')
    df1 = df1.reset_index(drop=True)
    df1 = df1.rename(columns={'Title': 'text'})
    return df1

# Load and process Guardian data
guardian_df = pd.read_csv('/content/drive/MyDrive/personal project/guardian_data.csv')
guardian_df = clean_guardian_data(guardian_df)
guardian_df = add_sentiment_analysis(guardian_df, 'text')
guardian_df.to_csv('cleaned_guardian_data.csv', index=False)
print("Guardian data processed and saved.")
guardian_df.head()

Guardian data processed and saved.


Unnamed: 0,text,Publication Date,sentiment_scores,sentiment
0,Andrew Giles released four people from immigra...,2024-06-24T15:00:42Z,"{'neg': 0.200, 'neu': 0.800, 'pos': 0.000, 'co...",negative
1,Australia’s immigration minister suppresses de...,2024-05-07T15:00:20Z,"{'neg': 0.000, 'neu': 1.000, 'pos': 0.000, 'co...",neutral
2,Immigration minister re-cancels visa of allege...,2024-05-28T08:07:50Z,"{'neg': 0.315, 'neu': 0.685, 'pos': 0.000, 'co...",negative
3,"Boost drug search powers, make immigration det...",2024-04-21T20:00:05Z,"{'neg': 0.176, 'neu': 0.634, 'pos': 0.190, 'co...",negative
4,Dutton’s plan to cut immigration would cost Au...,2024-05-19T04:07:13Z,"{'neg': 0.154, 'neu': 0.735, 'pos': 0.110, 'co...",negative


In [28]:
# Cell 6: Final check
print("All data processed and sentiment analysis applied.")
print(f"Twitter dataset shape: {twitter_df.shape}")
print(f"Reddit dataset shape: {reddit_df.shape}")
print(f"Guardian dataset shape: {guardian_df.shape}")

All data processed and sentiment analysis applied.
Twitter dataset shape: (468, 6)
Reddit dataset shape: (2201, 7)
Guardian dataset shape: (269, 4)
