In [None]:
!pip install -U imbalanced-learn scikit-learn

In [None]:
!pip install --upgrade scikit-learn imbalanced-learn


In [None]:
import numpy as np
import pandas as pd
import os
import tweepy as tw
import re    # it is RegEx useed to remove non-letter characters
import nltk

nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *

# For Building the model
from sklearn.model_selection import train_test_split
import tensorflow as tf
import seaborn as sns

#For data visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline

pd.options.plotting.backend = "plotly"


!pip install emoji

!pip install tweepy

## Fetching the newly created merged dataset using a Dataframe

In [None]:
# Draw a reproducible random sample of at most SAMPLE_SIZE reviews from the merged
# dataset. The fixed random_state keeps every downstream count, plot and model
# comparable across "Restart & Run All"; the min() guard keeps .sample() from
# raising when the CSV holds fewer rows than SAMPLE_SIZE.
SAMPLE_SIZE = 100000
merged_raw = pd.read_csv('merged_raw_dataset.csv')
df1 = merged_raw.sample(n=min(SAMPLE_SIZE, len(merged_raw)), random_state=42)
del merged_raw  # the full frame is not needed once the sample is taken

In [None]:
df1.head()

In [None]:
# Number of records (rows) in the dataset; shape[0] is the row count,
# whereas the bare shape would print the (rows, columns) tuple.
row_count = df1.shape[0]
print("Number of Records in the raw dataset: ",row_count)

# Finding the unique App Domains in the Merged Dataset

In [None]:
# Distinct values of the 'App Domain' column, in order of first appearance
unique_app_domains = pd.unique(df1['App Domain'])
print(unique_app_domains)

# Finding the unique App names in the merged dataset

In [None]:
# Distinct values of the 'AppName' column, in order of first appearance
unique_app_name = pd.unique(df1['AppName'])
print(unique_app_name)

# Initial Descriptive Statistics

In [None]:
df1.describe()

# Initial EDA

In [None]:
# Treat every review as text so that len() is defined for each row
df1['content'] = df1['content'].astype(str)
# Calculate the length (in characters) of each review
df1['ReviewCharLength'] = df1['content'].apply(len)

print(df1['ReviewCharLength'])
# Plot a box plot for review lengths.
# The lengths are drawn on the y-axis (y=...), so the axis description must be
# set with ylabel; xlabel would label the empty category axis instead.
plt.figure(figsize=(6, 4))
sns.boxplot(y=df1['ReviewCharLength'])
plt.title('Number of characters in the Review ')
plt.ylabel('Review Character Length')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Calculate the length of each review
df1['ReviewCharLength'] = df1['content'].apply(len)

# Plot a histogram for review lengths (40 equal-width bins).
# The separate np.histogram call was removed: its 20-bin result was never used
# and did not match the 40 bins drawn here.
plt.figure(figsize=(8, 5))
plt.hist(df1['ReviewCharLength'], bins=40, color='skyblue', edgecolor='black')
plt.title('Histogram for the Character Count of the Reviews')
plt.xlabel('Review Character Length')
plt.ylabel('Frequency')
plt.show()


In [None]:
import pandas as pd
import regex
from collections import Counter
import matplotlib.pyplot as plt

# Assuming df1 is your DataFrame
# Collect every symbol character that occurs in a piece of text
def extract_special_characters(text):
    """Return a list of all characters in `text` matched by the pattern \\p{S}.

    Matches are returned in order of appearance and repeated characters are
    kept, so the list can be fed straight into a frequency count.
    """
    symbol_pattern = regex.compile(r'\p{S}')
    return list(symbol_pattern.findall(text))

# Apply the function to the "content" column in the 'df1' dataframe
df1['special_characters'] = df1['content'].apply(extract_special_characters)

# Count the occurrences of each special character across all reviews in one pass
special_character_counts = Counter(
    char for chars in df1['special_characters'] for char in chars
)

# most_common() orders the (character, count) pairs from most to least frequent
sorted_special_characters = dict(special_character_counts.most_common())

# Get the top 10 most used special characters
top_10_special_characters = dict(special_character_counts.most_common(10))

# Plot the top 10 most used special characters
plt.figure(figsize=(10, 6))
plt.bar(top_10_special_characters.keys(), top_10_special_characters.values())
plt.title('Top 10 Most Used Special Characters in Reviews')
plt.xlabel('Special Character')
plt.ylabel('Count')
plt.show()

# Report the single most frequent special character.
# Guarded because there is no maximum to report when no review contains one.
if special_character_counts:
    highest_count_character, highest_count = special_character_counts.most_common(1)[0]
    print(f"The special character with the highest count is '{highest_count_character}' with a count of {highest_count}")
else:
    print("No special characters were found in the reviews.")

# Display all top 10 characters and their counts
print("Top 10 Special Characters:")
for char, count in top_10_special_characters.items():
    print(f"{char}: {count}")


In [None]:
df1.head()

In [None]:
# Bar chart showing how many reviews carry each value of 'score'
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df1, x='score', ax=ax)
ax.set_title('Distribution of Customer Ratings')
ax.set_xlabel('Customer Rating')
ax.set_ylabel('Count')
plt.show()

# Data Cleaning

In [None]:
# Number of records (rows) in the dataset before cleaning; shape[0] is the row
# count, whereas the bare shape would print the (rows, columns) tuple.
row_count = df1.shape[0]
print("Number of Records in the raw dataset: ",row_count)

#Handling Null values


In [None]:
#Checking the number of null values before the removal of null values in content

print("The number of null values in the Dataset:\n",df1.isnull().sum())

In [None]:
# Discard the reviews that have no userName
df1 = df1.dropna(subset=['userName'])

In [None]:
# Null counts per column after the rows with a missing userName were dropped

print("The number of null values in the Dataset:\n",df1.isnull().sum())

In [None]:
# Order the reviews by their 'at' value so that filling follows that order.
# NOTE(review): 'at' is only converted to datetime further down the notebook, so
# at this point it sorts in whatever form read_csv loaded it — confirm the order.
df1 = df1.sort_values(by='at')

# Fill each gap from the previous row first, then fill any leading gaps
# from the next available row
filled_app_version = df1['appVersion'].ffill().bfill()
df1['appVersion'] = filled_app_version

print(df1['appVersion'])
print("The above values are the filled App version values  ")

In [None]:
# Order the reviews by their 'at' value so that filling follows that order
df1 = df1.sort_values(by='at')

# Fill each gap in 'reviewCreatedVersion' from the previous row first, then
# fill any leading gaps from the next available row
filled_review_version = df1['reviewCreatedVersion'].ffill().bfill()
df1['reviewCreatedVersion'] = filled_review_version

print(df1['reviewCreatedVersion'])
print("The above values are the filled reviewCreated Version values  ")

In [None]:
# Null counts per column after appVersion and reviewCreatedVersion were filled

print("The number of null values in the Dataset:\n",df1.isnull().sum())

In [None]:
# Replace every remaining missing value with the string 'NA'.
# NOTE(review): fillna is applied to the whole frame, not only to text columns, so
# any numeric or date column that still has gaps receives the string 'NA' as well —
# confirm that only text columns contain nulls at this point.
df1.fillna('NA', inplace=True)

In [None]:
#Checking the null values after the removal
df1.isnull().sum()

# Number of Records left after removing null values

In [None]:
# Count the number of rows left after removing the null values in Review content column
row_count = df1.shape[0]
print("The number of records left after removing all the null values:", row_count)

# Handling Duplicates

In [None]:
#Checking if there duplicates in the review id column to verify whether to consider the removal of duplicate column
print("Number of duplicates in the ReviewidColumn: ",len(df1['reviewId'])-len(df1['reviewId'].drop_duplicates()))

In [None]:
# Drop rows that repeat an already-seen reviewId, keeping the first occurrence.
# drop_duplicates(..., inplace=True) returns None, so the result is assigned
# without inplace and the remaining duplicates are counted explicitly.
df_no_duplicates = df1.drop_duplicates(subset='reviewId')
df1 = df_no_duplicates
print("The number of duplicates left: ",df1['reviewId'].duplicated().sum())

In [None]:
#Check the count of duplicates in Review Content Coumn

print("Number of duplicates in the Content Column: ",len(df1['content'])-len(df1['content'].drop_duplicates()))

# Number of records after removing duplicates

In [None]:
# Count the number of rows left after removing the null values in Review content column
row_count = df1.shape[0]
print("The number of records  after removing duplicates in the review id column: ",row_count)

# Handling emoji data

In [None]:
def remove_emoji(text):
    """Return `text` with every run of characters from the ranges below removed.

    Several ranges are much wider than emoji alone: U+24C2-U+1F251 covers every
    code point in between (CJK ideographs included) and U+10000-U+10FFFF covers
    all supplementary planes, so text in those ranges is stripped as well.
    """
    symbol_class = (
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"  # very wide span, see docstring
        "\U0001f926-\U0001f937"  # gesture emoji
        "\U00010000-\U0010ffff"  # all supplementary planes
        "\u2640-\u2642"          # gender signs
        "\u2600-\u2B55"          # miscellaneous symbols up to the heavy circle
        "\u200d"                 # zero-width joiner
        "\u23cf"                 # eject symbol
        "\u23e9"                 # fast-forward symbol
        "\u231a"                 # watch
        "\ufe0f"                 # variation selector-16
        "\u3030"                 # wavy dash
        "]+"
    )
    return re.sub(symbol_class, "", text, flags=re.UNICODE)

# Apply the function to the "content" column
df1['content'] = df1['content'].apply(remove_emoji)

In [None]:
# Sanity check: list the rows whose text would still change if remove_emoji were
# applied again, i.e. rows that still contain characters it strips.
# (Comparing the text column to True never matches a string, so that filter
# always came back empty regardless of the data.)
emoji_rows = df1[df1['content'].apply(lambda text: remove_emoji(text) != text)]

# Print the rows with emoji (expected to be empty after the removal step)
print(emoji_rows)

# Handling Special characters

In [None]:
# Replace colons in a piece of text with spaces
def remove_special_characters(text):
    """Return `text` with every ':' replaced by a single space.

    Despite the general name, the colon is the only character handled here;
    each colon is replaced individually, so '::' becomes two spaces.
    """
    return text.replace(':', ' ')

# Apply the function to remove special characters and replace with spaces
df1['content'] = df1['content'].apply(remove_special_characters)

In [None]:
# Keep only the reviews whose text is non-empty after cleaning
has_text = df1['content'].str.len() > 0
df1 = df1[has_text]

# Show the remaining review texts
print(df1['content'])

In [None]:
# Count the rows left after the cleaning steps; shape[0] is the row count,
# whereas the bare shape would print the (rows, columns) tuple.
row_count = df1.shape[0]
print("The number of records after Data Cleaning ", row_count)

# Data Transformation
## Data Discretization

In [None]:
def classify_score(value):
    """Map a star rating to a sentiment code.

    Returns 1 for ratings above 3, 0 for exactly 3 and -1 for ratings below 3.
    A value that satisfies none of the comparisons (such as NaN) yields None.
    """
    if value > 3:
        return 1
    if value == 3:
        return 0
    if value < 3:
        return -1
    return None

df1['category'] = df1['score'].apply(classify_score)

df1['category'].value_counts()

In [None]:
# Translate the numeric sentiment code (-1 / 0 / 1) into a readable label.
# NOTE(review): this cell is not idempotent — once 'category' holds the string
# labels, running it again finds none of the numeric keys and map() turns every
# value into NaN. Re-run the discretization cell first if this has to be repeated.
df1['category'] = df1['category'].map({-1.0:'Negative', 0.0:'Neutral', 1.0:'Positive'})
df1.head()


# Displaying Positive Distribution

In [None]:
# Distributing all +ve sentiment tweets
import matplotlib.pyplot as plt
import seaborn as sns
# Word-count distribution of the Positive reviews:
# summary statistics table on the left, histogram on the right.
df1['word_count'] = df1['content'].str.split().apply(len)
positive_word_counts = df1.loc[df1['category'] == 'Positive', 'word_count']
describe = positive_word_counts.describe().to_frame().round(2)

fig = plt.figure(figsize=(14,7))

# Right panel: histogram; bin_size is passed as the number of bins
bin_size = 40
ax1 = fig.add_subplot(122)
sns.histplot(positive_word_counts, ax=ax1, color='green', bins=bin_size)

# Left panel: the describe() output rendered as a table on hidden axes
ax2 = fig.add_subplot(121)
ax2.axis('off')
font_size = 14
bbox = [0, 0, 1, 1]
table = ax2.table(cellText=describe.values,
                  rowLabels=describe.index,
                  colLabels=describe.columns,
                  bbox=bbox)
table.set_fontsize(font_size)

fig.suptitle('Distribution of text length for positive sentiment Reviews.', fontsize=16)
plt.show()

# Displaying Negative Distribution

In [None]:
# Word-count distribution of the Negative reviews:
# summary statistics table on the left, histogram on the right.
df1['word_count'] = df1['content'].str.split().apply(len)
negative_word_counts = df1.loc[df1['category'] == 'Negative', 'word_count']
describe = negative_word_counts.describe().to_frame().round(2)

fig = plt.figure(figsize=(14,7))

# Right panel: histogram; bin_size is passed as the number of bins
bin_size = 40
ax1 = fig.add_subplot(122)
sns.histplot(negative_word_counts, ax=ax1, color='red', bins=bin_size)

# Left panel: the describe() output rendered as a table on hidden axes
ax2 = fig.add_subplot(121)
ax2.axis('off')
font_size = 14
bbox = [0, 0, 1, 1]
table = ax2.table(cellText=describe.values,
                  rowLabels=describe.index,
                  colLabels=describe.columns,
                  bbox=bbox)
table.set_fontsize(font_size)

fig.suptitle('Distribution of text length for Negative sentiment Reviews.', fontsize=16)
plt.show()

# Displaying Neutral Distribution

In [None]:
# Word-count distribution of the Neutral reviews:
# summary statistics table on the left, histogram on the right.
df1['word_count'] = df1['content'].str.split().apply(len)
neutral_word_counts = df1.loc[df1['category'] == 'Neutral', 'word_count']
describe = neutral_word_counts.describe().to_frame().round(2)

fig = plt.figure(figsize=(14,7))

# Right panel: histogram; bin_size is passed as the number of bins
bin_size = 40
ax1 = fig.add_subplot(122)
sns.histplot(neutral_word_counts, ax=ax1, color='Yellow', bins=bin_size)

# Left panel: the describe() output rendered as a table on hidden axes
ax2 = fig.add_subplot(121)
ax2.axis('off')
font_size = 14
bbox = [0, 0, 1, 1]
table = ax2.table(cellText=describe.values,
                  rowLabels=describe.index,
                  colLabels=describe.columns,
                  bbox=bbox)
table.set_fontsize(font_size)

fig.suptitle('Distribution of text length for Neutral sentiment Reviews.', fontsize=16)
plt.show()

# Wordcloud

In [None]:
from wordcloud import WordCloud, STOPWORDS

def wordcount_gen(df, category):
    '''
    Plot a word cloud of the most frequent words in the reviews of one class.

    inputs:
       - df: DataFrame with a 'category' column and a 'content' text column
       - category: Positive/Negative/Neutral
    The figure is shown with plt.show(); nothing is returned.
    '''
    # Select the rows from the frame that was passed in (not the notebook-level
    # df1), so the function works for any DataFrame with the same columns.
    combined_tweets = " ".join(df.loc[df['category'] == category, 'content'])

    # Nothing to draw when the class has no text at all
    if not combined_tweets.strip():
        print('No {} reviews available for a word cloud.'.format(category))
        return

    # starting wordcloud object
    wc = WordCloud(background_color='white',
                   max_words=50,
                   stopwords = STOPWORDS)

    # Create and plot wordcloud
    plt.figure(figsize=(5,5))
    plt.imshow(wc.generate(combined_tweets))
    plt.title('{} Sentiment Words'.format(category), fontsize=30,color='black')
    plt.axis('off')
    plt.show()

# +ve tweet words
wordcount_gen(df1, 'Positive')

# -ve tweet words
wordcount_gen(df1, 'Negative')

# Neutral tweet words
wordcount_gen(df1, 'Neutral')



In [None]:
# Count the rows currently in df1; shape[0] is the row count, whereas the bare
# shape would print the (rows, columns) tuple.
# NOTE(review): the message still refers to the duplicate-removal step although
# further filtering has happened since — consider rewording it.
row_count = df1.shape[0]
print("The number of records  after removing duplicates in the review id column: ",row_count)

In [None]:
# Example of viewing the processed content
print(df1[['content']])

# Data Normalization

In [None]:
# Unaccented Latin letters, lower and upper case
english_alphabets = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Check whether a text starts with an English letter
def starts_with_english(s):
    """Return True when the first non-whitespace character of `s` is in english_alphabets.

    Non-string input, empty strings and whitespace-only strings return False.
    """
    if not isinstance(s, str):
        return False
    parts = s.split()
    return bool(parts) and parts[0][0] in english_alphabets

# Keep only the reviews whose text starts with an English letter
if 'content' not in df1.columns:
    print("The 'content' column does not exist in the DataFrame.")
else:
    starts_english = df1['content'].map(starts_with_english)
    df1 = df1[starts_english]

    # Show the first few rows that survived the filter
    print(df1.head())


In [None]:
# Example of viewing the processed content
print(df1[['content']])

In [None]:
df1.head()

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Ensure necessary NLTK data is downloaded.
# 'punkt_tab' is the tokenizer resource that recent NLTK releases load for
# word_tokenize; 'punkt' is kept for older releases. A resource that cannot be
# downloaded is reported by nltk.download's return value rather than an exception.
nltk_data = ['punkt', 'punkt_tab', 'wordnet', 'stopwords']
for data in nltk_data:
    nltk.download(data, quiet=True)

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Load the NLTK English stop-word list once and keep it as a frozenset."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _wordnet_lemmatizer():
    """Create the WordNet lemmatizer once and reuse it for every review."""
    return WordNetLemmatizer()


def tweet_to_words(tweet):
    """Turn one review text into a list of cleaned, lemmatized tokens.

    Steps: lowercase, tokenize with word_tokenize, keep purely alphabetic tokens
    that are not English stop words, lemmatize each remaining token.

    Returns an empty list for any non-string input.

    The stop-word set and the lemmatizer are cached in the helpers above because
    this function is applied once per row; rebuilding them on every call would
    repeat the same work for each review.
    """
    if not isinstance(tweet, str):
        return []  # Return an empty list for non-string input

    # Lowercase, then tokenize
    tokens = word_tokenize(tweet.lower())

    stop_words = _english_stop_words()
    lemmatize = _wordnet_lemmatizer().lemmatize

    # Drop stop words and non-alphabetic tokens, then lemmatize what is left
    return [lemmatize(word) for word in tokens
            if word not in stop_words and word.isalpha()]

# Example DataFrame initialization (hypothetical)
# df1 = pd.DataFrame({
#     'content': ['Your tweet content here', 'Another tweet content'],
#     'category': ['Category1', 'Category2']
# })

# Apply the function to the 'content' column
df1['processed_content'] = df1['content'].apply(tweet_to_words)

# Safely accessing the first row
if not df1.empty:
    print("\nOriginal tweet ->", df1['content'].iloc[0])
    print("\nProcessed tweet ->", df1['processed_content'].iloc[0])

# The token lists are already stored in 'processed_content'. Passing them through
# tweet_to_words a second time would hit its non-string guard and turn every
# row into an empty list, so the column is used as it is.
X = df1['processed_content'].tolist()

from sklearn.preprocessing import LabelEncoder

# Encode the sentiment labels as integers
le = LabelEncoder()
Y = le.fit_transform(df1['category'])

# Displaying the processed content and category for the first row, if available
if len(X) > 0:
    print("\nProcessed Content of the first tweet:", X[0])
    print("Category of the first tweet:", Y[0])


In [None]:
# Number of records (rows) in the dataset; shape[0] is the row count,
# whereas the bare shape would print the (rows, columns) tuple.
row_count = df1.shape[0]
print("Number of Records after Data Transformation ",row_count)

In [None]:
# Example of viewing the processed content
print(df1[['content']].tail())

# Handling Inconsistent Data

### Checking User Rating and Review Mismatch

In [None]:
# Flag reviews whose wording contradicts the rating on the 5-point scale:
# a rating above 3 whose text contains 'worst', or a rating below 3 whose text
# contains 'good'. Both checks are case-sensitive substring matches.
review_text = df1['content']
high_but_worst = df1['score'].gt(3) & review_text.str.contains('worst', regex=False)
low_but_good = df1['score'].lt(3) & review_text.str.contains('good', regex=False)
df1['rating_comment_mismatch'] = high_but_worst | low_but_good

In [None]:
# Count the flagged rows. Summing the boolean column yields 0 when nothing is
# flagged, whereas value_counts()[True] raises KeyError in that case.
print("The number of mismatch between the Review content and Reviewer Score:",int(df1['rating_comment_mismatch'].sum()))

In [None]:
 #Checking the rating mismatch
print(df1['rating_comment_mismatch'])

## Inconsistent datatypes

In [None]:
df1.dtypes

## Converting 'At' Column representing Timestamp of Review to Standard Date & Time Format

In [None]:
# Convert date to a standard format (e.g., YYYY-MM-DD)
df1['at'] = pd.to_datetime(df1['at'], errors='coerce')  # 'coerce' will set invalid parsing as NaT

In [None]:
df1.dtypes

## Removing Outliers

In [None]:
# Keep only the rows whose score lies strictly within three standard deviations
# of the mean score
rating_mean = df1['score'].mean()
rating_std = df1['score'].std()
lower_bound = rating_mean - 3 * rating_std
upper_bound = rating_mean + 3 * rating_std
within_bounds = df1['score'].gt(lower_bound) & df1['score'].lt(upper_bound)
df1 = df1[within_bounds]

In [None]:
#Check the unique values in Score Column
unique_values = df1['score'].unique()
print(unique_values)

# Handling HTML Tags

In [None]:
#Handling HTML Tags (if applicable)
df1['content'] = df1['content'].str.replace('<[^<]+?>', '', regex=True)


In [None]:
def has_html_tags(text):
    """Return True when `text` contains at least one '<...>' span, else False.

    The check is purely textual: any '<' followed later on the same line by a
    '>' counts, whether or not it is a real HTML tag.
    """
    # Non-greedy match from a '<' to the nearest following '>'
    return re.search(r'<.*?>', text) is not None

# Apply the function to each row of the DataFrame
df1['has_html_tags'] = df1['content'].apply(has_html_tags)

# Display the DataFrame
print("Is there any HTML Tags in the Text?:\n",df1['has_html_tags'])

In [None]:
# Example of viewing the processed content
print(df1[['content']].tail())

In [None]:
# Number of records (rows) in the dataset; shape[0] is the row count,
# whereas the bare shape would print the (rows, columns) tuple.
row_count = df1.shape[0]
print("Number of Records after Data Transformation ",row_count)

In [None]:
df1.isnull().sum()

#Scaling Numerical Features

In [None]:
from sklearn.preprocessing import MinMaxScaler
df1['ReviewCharLength'] = df1['content'].apply(len)
scaler = MinMaxScaler()
df1[['thumbsUpCount', 'ReviewCharLength']] = scaler.fit_transform(df1[['thumbsUpCount', 'ReviewCharLength']])

In [None]:
df1.head()

#Data Reduction

In [None]:
# Identifier, reply and helper columns that are not needed for modelling
columns_to_drop = [
    'Unnamed: 0',
    'reviewId',
    'userName',
    'userImage',
    'replyContent',
    'repliedAt',
    'special_characters',
    'rating_comment_mismatch',
    'has_html_tags',
]

# drop() returns a new frame; df1 itself keeps all of its columns
reduced_df = df1.drop(columns=columns_to_drop)

# List the columns that remain
column_names = list(reduced_df.columns)
print(column_names)

In [None]:
reduced_df.shape

In [None]:
# Select two columns for which you want to find covariance and correlation
column1 = 'score'

column2 = 'thumbsUpCount'

# Calculate covariance
covariance_value = df1[column1].cov(df1[column2])

# Calculate correlation
correlation_value = df1[column1].corr(df1[column2])

# Display the results
print(f'Covariance between {column1} and {column2}: {covariance_value}')
print(f'Correlation between {column1} and {column2}: {correlation_value}')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Set up the matplotlib figure
plt.figure(figsize=(12, 8))

# Create a histogram using Seaborn
sns.histplot(df1['score'], bins=10, kde=True, color='skyblue', edgecolor='black')
plt.xticks([1, 2, 3, 4, 5])
# Add labels and title
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.title('Distribution of Scores')

# Display the plot
plt.show()

In [None]:
reduced_df.tail()

In [None]:
reduced_df.shape

# EDA: Correlation Matrix

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming df is your DataFrame
# For example, df = pd.read_csv('your_dataset.csv')

# Calculate the correlation matrix over the numeric columns only.
# reduced_df still holds text columns (e.g. 'content' and 'category'); current
# pandas versions raise on such columns instead of silently skipping them, so
# the numeric restriction is requested explicitly.
correlation_matrix = reduced_df.corr(numeric_only=True)

# Set up the matplotlib figure
plt.figure(figsize=(10, 8))

# Create a heatmap using Seaborn, annotating each cell with its coefficient
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)

# Display the heatmap
plt.title("Correlation Heatmap")
plt.show()


In [None]:
#calculating vocab size to create word corpus
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

# Resources for the tweet processing function, built once: loading the stop-word
# list for every single word (and creating a stemmer per word) repeated the same
# work for each token of each review.
_STEM_STOP_WORDS = frozenset(stopwords.words("english"))
_PORTER_STEMMER = PorterStemmer()

# Define the tweet processing function
# NOTE: this definition replaces the lemmatizing tweet_to_words defined earlier
# in the notebook for every cell that runs after this one.
def tweet_to_words(tweet):
    """Return the stemmed, stop-word-free words of one review text.

    The text is lowercased, every character other than a-z, A-Z and 0-9 is
    replaced by a space, the result is split on whitespace, English stop words
    are dropped and the remaining words are stemmed with the Porter stemmer.
    """
    text = re.sub(r"[^a-zA-Z0-9]", " ", tweet.lower())
    return [_PORTER_STEMMER.stem(w) for w in text.split() if w not in _STEM_STOP_WORDS]

# Process all tweets and create a list of all words
all_words = []
for tweet in reduced_df['content']:
    if isinstance(tweet, str):
        all_words.extend(tweet_to_words(tweet))

# Determine the vocabulary size
vocabulary_size = len(set(all_words))
print("Total Vocabulary Size:", vocabulary_size)


## Performing topic modeling using LDA

In [None]:
import pandas as pd
from collections.abc import Iterable  # For checking if the value is an iterable
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Turn a token list back into one space-separated string
def join_tokens(tokens):
    """Join an iterable of tokens with single spaces.

    Strings and non-iterable values (for example a float) are mapped to the
    empty string instead of being joined.
    """
    if isinstance(tokens, str) or not isinstance(tokens, Iterable):
        return ''
    return ' '.join(tokens)

# Apply the function to your DataFrame
reduced_df['processed_text'] = reduced_df['processed_content'].apply(join_tokens)

# Example of further code
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = count_vectorizer.fit_transform(reduced_df['processed_text'])

num_topics = 10
lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
lda.fit(dtm)

# Collect the highest-weighted words of every topic
def display_topics(model, feature_names, no_top_words):
    """Return a dict mapping 'Topic N:' (N starting at 1) to its top words.

    For each row of model.components_, the `no_top_words` entries with the
    largest weights are looked up in `feature_names`, largest weight first.
    """
    return {
        "Topic %d:" % topic_number: [
            feature_names[i] for i in weights.argsort()[:-no_top_words - 1:-1]
        ]
        for topic_number, weights in enumerate(model.components_, start=1)
    }

# Displaying the top words for each topic
no_top_words = 10
topic_words = display_topics(lda, count_vectorizer.get_feature_names_out(), no_top_words)
print(topic_words)


In [None]:
# Assign one of four hand-named topic labels to every review.
# Assuming `dtm` is the document-term matrix for the comments
# and `lda` is the trained LDA model
topic_distributions = lda.transform(dtm)

# The indices of topics of interest (columns of topic_distributions)
# NOTE(review): the pairing of these indices with the class names below was
# presumably read off one particular LDA fit — re-check it whenever the sample,
# the vectorizer settings or the LDA configuration change.
topics_of_interest = [7, 4, 8, 1]  # these indices are one less than the topic numbers because of 0-indexing

# Filter the topic distributions to only include the four topics of interest
filtered_distributions = topic_distributions[:, topics_of_interest]

# Get the index of the topic with the max probability for each comment
# The index will correspond to the position in topics_of_interest.
# Because only the four selected columns are compared, every review receives one
# of the four labels even when a topic outside the selection has a higher weight.
topic_indices = filtered_distributions.argmax(axis=1)

# Map the index (position in topics_of_interest) to the class names
class_names = {0: "app functionality", 1: "customer service", 2: "user friendly", 3: "payment experience"}

# Use the mapping to translate indices to class names
reduced_df['topic_label'] = [class_names[index] for index in topic_indices]

# Now reduced_df will have a new column 'topic_label' with class names as labels

# Now reduced_df will have a new column 'topic_label' with class names as labels


In [None]:
print(reduced_df[['content', 'topic_label']])

## WORD2VEC

In [None]:
# creating word2vec nad spling the data
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Ensure necessary NLTK data is downloaded
nltk.download('stopwords')

# Initialize stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def vectorize_text(word_list, model):
    """Average the Word2Vec vectors of the words in `word_list`.

    Words missing from model.wv.key_to_index are ignored. When no word is left,
    a zero vector of length model.vector_size is returned.
    """
    known_vectors = [
        model.wv[token] for token in word_list if token in model.wv.key_to_index
    ]

    # Nothing in vocabulary: fall back to the zero vector
    if not known_vectors:
        return np.zeros(model.vector_size)

    # Element-wise mean over the remaining word vectors
    return np.mean(known_vectors, axis=0)

# Replace float entries (i.e. NaN) in 'processed_content' with an empty token list
reduced_df['processed_content'] = reduced_df['processed_content'].apply(lambda x: [] if isinstance(x, float) else x)

# Train the Word2Vec model on the token lists of all reviews.
# NOTE(review): the embeddings are fitted before the train/validation/test split
# below, so validation and test reviews take part in training the embeddings.
# NOTE(review): no seed is passed here, so the vectors can differ between runs.
word2vec_model = Word2Vec(sentences=reduced_df['processed_content'], vector_size=100, window=5, min_count=1, workers=4)

# One averaged vector per review: X has one row per review and vector_size columns
X = np.array([vectorize_text(tokens, word2vec_model) for tokens in reduced_df['processed_content']])

# Label encoding for the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(reduced_df['category'])

# Splitting the data into train, validation, and test sets with stratification:
# 80% train, then the remaining 20% is halved into validation and test (10% each)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Now, X_train, X_val, X_test, y_train, y_val, and y_test are ready for further use


In [None]:
# Print the shapes of the datasets
print("Training Set Shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Validation Set Shape (X_val, y_val):", X_val.shape, y_val.shape)
print("Test Set Shape (X_test, y_test):", X_test.shape, y_test.shape)




In [None]:
#displaying word2vec result
# Create a new DataFrame with the vector data
df_vectors = pd.DataFrame(X, columns=[f'vector_{i}' for i in range(X.shape[1])])

# Ensure that the length of reduced_df matches the length of X
if len(reduced_df) == len(X):
    for i in range(X.shape[1]):
        reduced_df[f'vector_{i}'] = X[:, i]

# Print the first few rows
print("First few rows of the DataFrame with Word2Vec vectors:")
df_vectors.head()


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_words = 21688
max_len = 100

def tokenize_pad_sequences(text):
    '''
    Fit a Tokenizer on `text`, convert every entry to a sequence of integer
    word indices and pad the sequences at the end to max_len entries.

    Uses the notebook-level settings max_words (passed as num_words) and max_len.
    Returns the padded sequences together with the fitted tokenizer.
    '''
    tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
    tokenizer.fit_on_texts(text)
    # Text -> integer sequences -> post-padded sequences of equal length
    sequences = tokenizer.texts_to_sequences(text)
    padded = pad_sequences(sequences, padding='post', maxlen=max_len)
    return padded, tokenizer

# Show one review before and after tokenization and padding.
# Ensure that the DataFrame is not empty and has enough rows (row 5 is displayed)
if not reduced_df.empty and len(reduced_df) > 5:
    print('Before Tokenization & Padding \n', reduced_df['content'].iloc[5])
    # NOTE(review): this rebinds the notebook-level X, which until now held the
    # averaged Word2Vec vectors. X_train/X_val/X_test were split off earlier and
    # are not affected, but any cell that reads X and is run after this one sees
    # the padded integer sequences instead.
    X, tokenizer = tokenize_pad_sequences(reduced_df['content'])
    print('After Tokenization & Padding \n', X[5])
else:
    print("DataFrame is empty or does not have enough rows.")


In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
#performing SMOTE analysis
# Assuming X_train, y_train are your training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Now, X_train_smote and y_train_smote are your new training sets with balanced classes

# Displaying the shape of the datasets
print("Original Training Set Shape:", X_train.shape, y_train.shape)
print("SMOTE Resampled Training Set Shape:", X_train_smote.shape, y_train_smote.shape)


In [None]:
#checking distribution of smote data
import numpy as np

unique, counts = np.unique(y_train_smote, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("Class distribution in y_train_smote:", class_distribution)


## BILSTM

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from keras.optimizers import Adam
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt


In [None]:
'''import tensorflow as tf
from tensorflow.keras.metrics import Metric

class F1Score(Metric):
    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.precision = Precision()
        self.recall = Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def result(self):
        p = self.precision.result()
        r = self.recall.result()
        return 2 * ((p * r) / (p + r + tf.keras.backend.epsilon()))

    def reset_states(self):
        self.precision.reset_states()
        self.recall.reset_states()'''


In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional, Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import SGD as LegacySGD
from keras.callbacks import LearningRateScheduler, EarlyStopping

# Model parameters
embedding_size = 100
max_len = 100
epochs = 10
learning_rate = 0.1
momentum = 0.8
# SGD Optimizer
sgd = LegacySGD(learning_rate=learning_rate, momentum=momentum, nesterov=False)
# Learning-rate schedule: step decay every fifth epoch
def scheduler(epoch, lr):
    """Return the learning rate to use for `epoch`.

    The current rate `lr` is multiplied by 0.1 at every epoch index that is a
    non-zero multiple of 5; at all other epochs it is returned unchanged.
    """
    is_decay_epoch = epoch != 0 and epoch % 5 == 0
    return lr * 0.1 if is_decay_epoch else lr

lr_scheduler = LearningRateScheduler(scheduler)
# Early Stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

# Building the model: one Conv1D + max-pooling stage followed by a single
# bidirectional LSTM layer, dropout and a softmax output layer.
# NOTE(review): the declared input is (max_len, 1), i.e. 100 steps with one
# channel, while X_train holds one 100-dimensional averaged Word2Vec vector per
# review — so each vector is presumably treated as a length-100 sequence;
# confirm this is intended.
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu', input_shape=(max_len, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(32)))  # single BiLSTM layer with 32 units per direction
model.add(Dropout(0.4))
model.add(Dense(3, activation='softmax'))  # Adjust the number of units to match the number of classes

# Model Summary
model.summary()





In [None]:
import tensorflow as tf
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Compile the Model
model.compile(optimizer=sgd, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [None]:
history = model.fit(X_train_smote, y_train_smote,
                    batch_size=32,  # Ensure this matches the actual batch size
                    epochs=epochs,
                    validation_data=(X_val, y_val),
                    callbacks=[lr_scheduler, early_stopping],
                    verbose=1)


In [None]:
import matplotlib.pyplot as plt

# Plotting training and validation accuracy
plt.figure(figsize=(8, 2))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Plotting training and validation loss
plt.figure(figsize=(8, 2))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


In [None]:
# Assuming X_val is your validation data and is already preprocessed
y_pred = model.predict(X_val)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predictions to class labels


In [None]:
from sklearn.metrics import confusion_matrix

# Assuming y_val are the true labels
confusion_mtx = confusion_matrix(y_val, y_pred_classes)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plotting
plt.figure(figsize=(10,6))
sns.heatmap(confusion_mtx, annot=True, fmt='d', cmap="Blues")
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score


In [None]:
# Calculate metrics
f1 = f1_score(y_val, y_pred_classes, average='weighted')
precision = precision_score(y_val, y_pred_classes, average='weighted')
recall = recall_score(y_val, y_pred_classes, average='weighted')

# Print the metrics
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


In [None]:
# Define your desired path
model_save_path = 'CNN_BiLSTM.h5'  # Replace with your desired path

# Save the model in HDF5 format to the specified path
model.save(model_save_path)


In [None]:
# Assuming X_test and y_test are your preprocessed test data and labels
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


In [None]:
from tensorflow.keras.models import load_model

# Define the path where the model is saved
model_save_path = 'CNN_BiLSTM.h5'

# Load the model
loaded_model = load_model(model_save_path)

print("Model loaded successfully.")
