# Importing Dependencies

In [4]:
import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# Setting Up nltk

In [None]:
nltk_data_dir = os.path.join(os.path.expanduser('~'), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)

required_resources = ['punkt', 'stopwords', 'wordnet', 'vader_lexicon', 'averaged_perceptron_tagger']
for resource in required_resources:
    nltk.download(resource, download_dir=nltk_data_dir)

try:
    for resource in required_resources:
        nltk.data.find(resource)
    print("All resources are properly downloaded and verified.")
except Exception as e:
    print(f"Verification failed: {e}")

Verification failed: list index out of range


[nltk_data] Downloading package punkt to C:\Users\Rameez
[nltk_data]     Haider\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Rameez
[nltk_data]     Haider\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Rameez
[nltk_data]     Haider\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to C:\Users\Rameez
[nltk_data]     Haider\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rameez Haider\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Loading and Cleaning Data

In [None]:
def load_and_clean_csv(file_path):
    data = pd.read_csv(file_path)
    data = data[['Timestamp', 'Content', 'Comments', 'Retweets', 'Likes', 'Analytics']]  
    data.dropna(subset=['Content'], inplace=True) 
    data = convert_columns_to_numbers(data, ['Comments', 'Retweets', 'Likes', 'Analytics'])  
    return data

# Converting Abbreviated Numbers

In [None]:
def convert_columns_to_numbers(df, columns):
    def convert_abbreviated_number(value):
        if isinstance(value, str):
            value = value.upper()
            if 'K' in value:
                return int(float(value.replace('K', '')) * 1_000)
            elif 'M' in value:
                return int(float(value.replace('M', '')) * 1_000_000)
            elif 'B' in value:
                return int(float(value.replace('B', '')) * 1_000_000_000)
        try:
            return int(value)
        except ValueError:
            return value  
    for column in columns:
        if column in df.columns:
            df[column] = df[column].apply(convert_abbreviated_number)
    return df

# Preprocessing Tweets

In [None]:
def preprocess_tweet(tweet):
    tweet = re.sub(r"http\S+|@\S+|#\S+|[^A-Za-z\s]", "", tweet) 
    tweet = tweet.lower() 
    tokens = word_tokenize(tweet) t
    tokens = [word for word in tokens if word not in stopwords.words('english')] 
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  
    return " ".join(tokens)

# Clean Data

In [None]:
def extract_mentions(content):
    return re.findall(r'@\w+', content)

def extract_hashtags(content):
    return re.findall(r'#\w+', content)

# Extracting Top Terms and Counting Keywords

In [None]:
def get_top_terms(data, column, n_terms=10):
    vectorizer = CountVectorizer(stop_words='english', max_features=n_terms)
    term_matrix = vectorizer.fit_transform(data[column])
    return vectorizer.get_feature_names_out()

def count_mentions(tweet, keywords):
    return sum(1 for word in tweet.split() if word in keywords)

# Sentiment Analysis

In [None]:
def perform_sentiment_analysis(data):
    sia = SentimentIntensityAnalyzer()
    data['sentiment_textblob'] = data['Content'].apply(lambda x: TextBlob(x).sentiment.polarity)
    data['sentiment_vader'] = data['Content'].apply(lambda x: sia.polarity_scores(x)['compound'])
    return data

# Main Method

In [None]:
def main():

    file_path = "path"  
    data = load_and_clean_csv(file_path)

    data['cleaned_content'] = data['Content'].apply(preprocess_tweet)

    data['Mentions_List'] = data['Content'].apply(extract_mentions)
    data['Hashtags_List'] = data['Content'].apply(extract_hashtags)

    top_mentions = data['Mentions_List'].explode().value_counts().index[:10].tolist()
    top_hashtags = data['Hashtags_List'].explode().value_counts().index[:10].tolist()
    top_terms = get_top_terms(data, 'cleaned_content', n_terms=10)

    keywords = top_mentions + top_hashtags + list(top_terms)

    data['mention_count'] = data['cleaned_content'].apply(lambda x: count_mentions(x, keywords))

    data = perform_sentiment_analysis(data)

    data.to_csv("processed_twitter_data.csv", index=False)
    print("Processed data saved to 'processed_twitter_data.csv'")

if __name__ == "__main__":
    main()


Processed data saved to 'processed_twitter_data.csv'
