ASSIGNMENT 2: NLP PROJECT ON T20 WORLD CUP TWEETS

DATA PREPROCESSING

In [None]:
import pandas as pd
import numpy as np

In [None]:
pip install openpyxl

In [None]:
# Load the tweet workbook into a DataFrame.
# NOTE(review): this is an absolute path under one user's home directory — it has to be
# edited before the notebook can run on any other machine.
df = pd.read_excel('/Users/jeevanhr/NLP ASSIGNMENT-01/nlp_project01/World Cup tweets/T20_Worldcup_tweets.xlsx')

In [None]:
df

In [None]:
# Lowercase the 'text' column; cells that are not strings (e.g. missing values) are left as they are
df['text'] = df['text'].apply(lambda cell: cell if not isinstance(cell, str) else cell.lower())

In [None]:
df

HTML tag removal

In [None]:
from bs4 import BeautifulSoup

def remove_html_tags(text):
    """Return ``text`` with HTML markup removed.

    Values that are not strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Parse with the "html.parser" backend and keep only the text content
    return BeautifulSoup(text, "html.parser").get_text()


# Strip HTML from the 'text' column
df['text'] = df['text'].apply(remove_html_tags)


In [None]:
df

Removing punctuation

In [None]:
import string

def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Values that are not strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Mapping a character to None in a translation table deletes it
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# Apply the function to the 'text' column
df['text'] = df['text'].apply(remove_punctuation)

In [None]:
df

Removing emoji

In [None]:
import re

# Compiled once when the cell runs, rather than on every call.
# The class deliberately avoids one wide range spanning the upper Basic Multilingual
# Plane: such a range would also match CJK ideographs, kana, Hangul and other
# ordinary text, which is not emoji and must survive this step.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"      # all supplementary-plane code points (emoticons, pictographs, flags, ...)
    "\u2500-\u2BEF"              # box drawing .. misc symbols and arrows (includes U+2600-U+2B55, dingbats)
    "\u24C2"                     # circled M
    "\u231a\u23cf\u23e9"         # watch, eject, fast-forward
    "\u200d"                     # zero-width joiner used inside emoji sequences
    "\ufe0f"                     # variation selector-16 (emoji presentation)
    "\u3030\u303d\u3297\u3299"   # wavy dash, part alternation mark, circled ideographs
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Remove emoji and pictographic symbols from ``text``.

    Parameters
    ----------
    text : object
        A string to clean. Any other value (e.g. NaN, None) is returned unchanged.

    Returns
    -------
    object
        ``text`` with every run of characters matched by ``_EMOJI_PATTERN`` deleted.
        Text in non-Latin BMP scripts (CJK, Hangul, kana, ...) is preserved. Every
        supplementary-plane character is removed, whether or not it is an emoji.
    """
    if not isinstance(text, str):
        return text
    return _EMOJI_PATTERN.sub('', text)

# Strip emojis from the 'text' column
df['text'] = df['text'].apply(remove_emojis)




In [None]:
df

Spelling correction

In [None]:
pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

# Shared SpellChecker instance, built on first use so that it is not
# re-created for every row passed to correct_spelling.
_SPELL_CHECKER = None


def _get_spell_checker():
    """Return the shared SpellChecker, constructing it the first time it is needed."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = SpellChecker()
    return _SPELL_CHECKER


def correct_spelling(text, spell=None):
    """Spell-correct the purely alphabetic words of ``text``.

    Parameters
    ----------
    text : object
        String to correct. Any other value (e.g. NaN) is returned unchanged.
    spell : object, optional
        Object exposing ``correction(word)``. When omitted, one shared
        ``SpellChecker`` is used for all calls.

    Returns
    -------
    object
        The words of ``text`` (split on whitespace) joined by single spaces.
        Alphabetic words are replaced by their correction; all other tokens are
        kept verbatim.
    """
    if not isinstance(text, str):
        return text
    if spell is None:
        spell = _get_spell_checker()
    # correction() can return None when it has no candidate (names, slang, hashtags
    # without '#'); keep the original word in that case instead of dropping it.
    return ' '.join(
        (spell.correction(word) or word) if word.isalpha() else word
        for word in text.split()
    )

# Only the first `sample_size` rows are spell-corrected; later rows keep their text as-is
sample_size = 1000
sample = df['text'].iloc[:sample_size]

# Write the corrected values back through ONE indexer on the frame itself.
# `df['text'].iloc[...] = ...` is chained assignment: it triggers SettingWithCopyWarning
# and, with pandas Copy-on-Write enabled, leaves `df` unmodified.
# `.to_numpy()` makes the assignment purely positional (no index alignment).
df.iloc[:sample_size, df.columns.get_loc('text')] = sample.apply(correct_spelling).to_numpy()


In [None]:
df

Tokenization


In [None]:
pip install nltk

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' tokenizer resource used by word_tokenize
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm against the installed version
nltk.download('punkt')

# Tokenize the 'text' column into word lists; cells that are not strings are passed through unchanged
df['text_tokens'] = df['text'].apply(lambda cell: cell if not isinstance(cell, str) else word_tokenize(cell))


In [None]:
df

Stop word removal


In [None]:
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus
nltk.download('stopwords')

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))


def remove_stopwords(tokens):
    """Return ``tokens`` without English stop words (compared case-insensitively).

    Values that are not lists are returned unchanged.
    """
    if not isinstance(tokens, list):
        return tokens
    return [token for token in tokens if token.lower() not in stop_words]


# Filter the tokenized column into a new column
df['text_tokens_without_stopwords'] = df['text_tokens'].apply(remove_stopwords)


In [None]:
df

Stemming


In [None]:
# Download NLTK resources (if you haven't already)
# NOTE(review): no cell visible in this notebook calls a POS tagger, so
# 'averaged_perceptron_tagger' looks unused here — confirm before removing.
nltk.download('averaged_perceptron_tagger')
# WordNet data for the WordNetLemmatizer used in the lemmatization cell
nltk.download('wordnet')


In [None]:
from nltk.stem import PorterStemmer

# One Porter stemmer shared by every row
porter = PorterStemmer()


def perform_stemming(tokens):
    """Return the Porter stem of each token in ``tokens``.

    Values that are not lists are returned unchanged.
    """
    if not isinstance(tokens, list):
        return tokens
    return list(map(porter.stem, tokens))


# Stem the stop-word-filtered tokens into a new column
df['text_stemmed'] = df['text_tokens_without_stopwords'].apply(perform_stemming)


In [None]:
df

Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# One WordNet lemmatizer shared by every row
lemmatizer = WordNetLemmatizer()


def perform_lemmatization(tokens):
    """Return the WordNet lemma of each token in ``tokens``.

    Values that are not lists are returned unchanged.
    """
    if not isinstance(tokens, list):
        return tokens
    return list(map(lemmatizer.lemmatize, tokens))


# Lemmatize the stemmed tokens into a new column
# NOTE(review): the input here is 'text_stemmed', i.e. tokens that have already been
# Porter-stemmed; lemmatizers are usually fed unstemmed tokens — confirm this order is intended.
df['text_lemmatized'] = df['text_stemmed'].apply(perform_lemmatization)


In [None]:
df

Sentiment Analysis

In [None]:
from textblob import TextBlob

def find_sentiment(text):
    """Score a token list with TextBlob and label the result.

    The tokens are joined with single spaces before scoring. Returns a
    ``(polarity, label)`` pair where the label is 'Positive' for polarity > 0,
    'Negative' for polarity < 0 and 'Neutral' for exactly 0. Inputs that are
    not lists yield ``(None, None)``.
    """
    if not isinstance(text, list):
        return None, None

    polarity = TextBlob(' '.join(text)).sentiment.polarity

    if polarity > 0:
        label = 'Positive'
    elif polarity < 0:
        label = 'Negative'
    else:
        label = 'Neutral'
    return polarity, label


# Score every row, then unzip the (polarity, label) pairs into two new columns
df['sentiment_polarity'], df['sentiment'] = zip(*df['text_lemmatized'].apply(find_sentiment))


In [None]:
df

In [None]:
#df.to_csv('world_cup001.csv', index=False)

LDA

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Re-join each row's token list into one string for the vectorizer;
# rows whose value is not a list become empty documents
text_data = df['text_lemmatized'].apply(lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else '')

# Term counts per document: drop terms found in more than 90% of documents
# or in fewer than 2 documents, plus scikit-learn's English stop words
vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
tf = vectorizer.fit_transform(text_data)

# Number of topics to identify (adjust as needed); fixed seed for repeatable topics
num_topics = 5
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

# Fit the model, then project the same documents onto the learned topics
topic_distributions = lda.fit(tf).transform(tf)

# One DataFrame column per topic, numbered from 1
for topic_number, topic_weights in enumerate(topic_distributions.T, start=1):
    df[f"topic_{topic_number}_distribution"] = topic_weights

# Show the first rows, including the new topic columns
print(df.head())

In [None]:
df

Contextual Analysis

In [None]:
# `plt` is used below but no visible cell imports matplotlib; import it here so the
# cell does not raise NameError after Restart Kernel -> Run All.
import matplotlib.pyplot as plt

# Columns added by the LDA cell all start with 'topic_'
topic_columns = [col for col in df.columns if col.startswith('topic_')]

# Creating a DataFrame for topic distributions
topic_df = df[topic_columns]

# Bar chart of each topic's weight summed over all rows
plt.figure(figsize=(10, 6))
topic_df.sum().plot(kind='bar', stacked=True)
plt.title('Topic Distribution')
plt.xlabel('Topics')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()


In [None]:
from collections import Counter

def extract_hashtags_and_words(text):
    """Split a user description into its hashtags and its words.

    Returns ``(hashtags, words)``: ``words`` is ``text`` split on whitespace and
    ``hashtags`` holds every word that starts with '#', with '#' characters
    stripped from both ends. Inputs that are not strings yield two empty lists.
    """
    if not isinstance(text, str):
        return [], []

    words = text.split()
    hashtags = []
    for word in words:
        if word.startswith("#"):
            hashtags.append(word.strip("#"))
    return hashtags, words

# Split every user description into its hashtags and its whitespace-separated words
df['hashtags'], df['description_words'] = zip(*df['user_description'].apply(extract_hashtags_and_words))

# Count "<hashtag>_<following word>" pairs.
# The position has to come from the word list itself: an index into the separate
# hashtag list says nothing about where the hashtag sits in the description, so
# using it to index `words` pairs the tag with an unrelated word.
word_freq_next_to_hashtags = Counter()
for words in df['description_words']:
    # The last word has no successor, so it is never a pair start
    for position, word in enumerate(words[:-1]):
        if word.startswith("#"):
            word_freq_next_to_hashtags[f"{word.strip('#')}_{words[position + 1]}"] += 1

# Get top 10 repeated words next to hashtags
top_10_words_next_to_hashtags = word_freq_next_to_hashtags.most_common(10)
print(top_10_words_next_to_hashtags)


In [None]:
# Write the processed DataFrame to 'world_cup001.csv' (relative path), without the index column
df.to_csv('world_cup001.csv', index=False)