In [1]:
from google.colab import drive
import os

In [193]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [254]:
# Dataset path
dataset_path = "/content/drive/MyDrive/TextModel_NaiveBayes/Dataset/training.csv"  # Adjust to your path

In [255]:
# Check if the dataset exists
if os.path.exists(dataset_path):
    print(f"Dataset is located at: {dataset_path}")
else:
    print("Dataset folder not found.")

Dataset is located at: /content/drive/MyDrive/TextModel_NaiveBayes/Dataset/training.csv


In [266]:
# Load the dataset
df = pd.read_csv(dataset_path)

In [267]:
# Display the basic info of the dataset
df.info()

num_characters = len(df)
num_words = len(df)
num_sentences = len(df)

print(f"Number of characters: {num_characters}")
print(f"Number of words: {num_words}")
print(f"Number of sentences: {num_sentences}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB
Number of characters: 16000
Number of words: 16000
Number of sentences: 16000


In [268]:
# Map numeric labels to their string names
label_mapping = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear'}
unique_labels = df['label'].unique()
for label in unique_labels:
    print(f"{label}: {label_mapping.get(label, 'Unknown label')}")

0: sadness
3: anger
2: love
5: Unknown label
4: fear
1: joy


In [269]:
# Checking for lowercase conversion
columns_checked = 0
columns_converted = 0
columns_not_converted = 0

# Check for lowercase conversion in the text columns
for column in df.select_dtypes(include=['object']):
    columns_checked += 1
    non_lowercase_rows = df[~df[column].apply(lambda x: x.islower() if isinstance(x, str) else True)]

    if not non_lowercase_rows.empty:
        columns_not_converted += 1
        print(f"Column '{column}' has non-lowercase values:")
        print(non_lowercase_rows[[column]])
    else:
        columns_converted += 1
        print(f"Column '{column}' is already in lowercase.")

# Summary of column checks
print("\nSummary:")
print(f"Total columns checked: {columns_checked}")
print(f"Columns already in lowercase: {columns_converted}")
print(f"Columns with non-lowercase values: {columns_not_converted}")

# Optionally, save rows with non-lowercase values
if columns_not_converted > 0:
    non_lowercase_rows.to_csv('non_lowercase_rows.csv', index=False)
    print("\nRows with non-lowercase values have been saved to 'non_lowercase_rows.csv'.")

Column 'text' is already in lowercase.

Summary:
Total columns checked: 1
Columns already in lowercase: 1
Columns with non-lowercase values: 0


In [270]:
# Checking if the dataset is already tokenized
are_tokenized = df['text'].apply(lambda x: isinstance(x, list)).all()

if are_tokenized:
    print("The dataset is tokenized.")
else:
    print("The dataset is not tokenized.")

The dataset is not tokenized.


In [271]:
# Define unwanted terms
unwanted_terms = ['aa', 'aaaaaaand', 'aaaaand', 'aaaand', 'aac', 'aahhh', 'ab', 'abc']
unwanted_terms_pattern = r'\b(' + '|'.join(map(re.escape, unwanted_terms)) + r')\b'

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess text (tokenization and lemmatization without stop word removal)
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove unwanted terms based on the pattern
    cleaned_tokens = [
        word for word in tokens if not re.search(unwanted_terms_pattern, word.lower())
    ]

     # Lemmatize the cleaned tokens
    lemmatized_tokens = [
        lemmatizer.lemmatize(word.lower())  # Lemmatize and convert to lowercase
        for word in cleaned_tokens if word.isalpha()  # Only keep alphabetic tokens
    ]

    # Remove stop words after lemmatization
    final_tokens = [
        word for word in lemmatized_tokens if word not in stop_words
    ]

    return final_tokens

# Apply the preprocessing function to the 'text' column
df['processed_text'] = df['text'].apply(preprocess_text)

# Display the first few rows to see the results
print(df[['text', 'processed_text']].head())


                                                text                                     processed_text
0                            i didnt feel humiliated                          [didnt, feel, humiliated]
1  i can go from feeling so hopeless to so damned...  [go, feeling, hopeless, damned, hopeful, aroun...
2   im grabbing a minute to post i feel greedy wrong  [im, grabbing, minute, post, feel, greedy, wrong]
3  i am ever feeling nostalgic about the fireplac...  [ever, feeling, nostalgic, fireplace, know, st...
4                               i am feeling grouchy                                 [feeling, grouchy]


In [272]:
# Define the output file path
output_file_path = "/content/drive/MyDrive/TextModel_NaiveBayes/Dataset/proccessed_text_dataOfTraining.csv"

# Save only the 'processed_text' column to a new CSV file
df[['processed_text']].to_csv(output_file_path, index=False)

print(f"Processed data has been saved to: {output_file_path}")

Processed data has been saved to: /content/drive/MyDrive/TextModel_NaiveBayes/Dataset/proccessed_text_dataOfTraining.csv


In [173]:
from sklearn.feature_extraction.text import CountVectorizer

In [263]:
# Dataset path
processedText_path = "/content/drive/MyDrive/TextModel_NaiveBayes/Dataset/proccessed_text_dataOfTraining.csv"  # Adjust to your path

In [273]:
# Load the dataset
df = pd.read_csv(processedText_path)

# Step 2: Prepare the data (assuming 'processed_text' and 'label' are columns)
# Convert list of tokens to string, handling potential NaN values
X_train = df['processed_text'].apply(lambda x: ' '.join(eval(x)) if pd.notna(x) else '')

# Step 3: Feature extraction using Bag of Words (BoW)
bow_vectorizer = CountVectorizer()  # No limit on features
X_train_bow = bow_vectorizer.fit_transform(X_train)

# Step 3a: Count the unique features (terms) in the BoW vectorizer
num_features_bow = len(bow_vectorizer.get_feature_names_out())  # Number of unique terms in BoW
print(f"Number of unique features (terms) using Bag of Words: {num_features_bow}")

# Step 3b: Display the actual features (terms) extracted by BoW
features = bow_vectorizer.get_feature_names_out()
print(f"Unique features (terms) extracted by Bag of Words: \n{features[:20]}")  # Display the first 20 terms

# Optionally, display the feature matrix (document-term matrix)
print("\nDocument-Term Matrix (BoW representation):")
print(X_train_bow.toarray())  # This will show the full document-term matrix (with counts)


Number of unique features (terms) using Bag of Words: 13442
Unique features (terms) extracted by Bag of Words: 
['aaron' 'abandon' 'abandoned' 'abandoning' 'abandonment' 'abated'
 'abbigail' 'abdomen' 'abdominal' 'abducted' 'abelard' 'abhorrent' 'abide'
 'ability' 'abit' 'able' 'ableness' 'abnormally' 'aboard' 'abominable']

Document-Term Matrix (BoW representation):
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [33]:
# Step 4: Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer()  # No limit on features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Step 4a: Count the unique features (terms) in the TF-IDF vectorizer
num_features_tfidf = len(tfidf_vectorizer.get_feature_names_out())  # Number of unique terms in TF-IDF
print(f"Number of unique features (terms) using TF-IDF: {num_features_tfidf}")

Number of unique features (terms) using TF-IDF: 13462


In [36]:
# Check the tokenized form of the text for a specific row
row_6469_tokens = eval(train_df['processed_text'][6469])
print(f"Tokens for row 6469: {row_6469_tokens}")


Tokens for row 6469: ['feel', 'place', 'posting', 'since', 'feel', 'hesitant', 'join', 'aa', 'full', 'force', 'could', 'use', 'insight', 'people', 'inside']
