In [6]:
import os 
# Select the pure-Python protobuf implementation via its environment variable.
# NOTE(review): presumably this has to be set before the transformers import
# below (which is why it sits between the imports) — confirm before moving it.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput
from datasets import Dataset, ClassLabel
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import re
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources behind the `stopwords` and `WordNetLemmatizer`
# imports above. The return value of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...


True

In [2]:
# Read the labelled e-mail workbook (columns: Text, Category, EmailType)
# and preview its first five rows.
df = pd.read_excel(io='Training_data.xlsx')
df.head(n=5)

Unnamed: 0,Text,Category,EmailType
0,"Hi *******, Your payment to Uber India was App...",category_3,email_type_93
1,Your Zomato Online Ordering receipt Refund Pro...,category_3,email_type_84
2,Electricity Bill Payment Successful ‚Çπ 979 Fo...,category_3,email_type_3
3,Payment requested by FINCFRIENDS PVT. LTD. Rec...,category_3,email_type_92
4,Greetings from Swiggy Your order was delivered...,category_3,email_type_86


In [3]:
# Discard every row that has a missing value, then keep only the first row
# for each distinct Text. Written as one chain so nothing is mutated in place.
df = df.dropna(axis=0).drop_duplicates(subset=['Text'])
df.shape

(32702, 3)

In [4]:
# Summary statistics for every column; include='all' also covers the
# non-numeric (text / label) columns.
df.describe(include='all')

Unnamed: 0,Text,Category,EmailType
count,32702,32702,32702
unique,32702,3,143
top,"Hi *******, Your payment to Uber India was App...",category_3,email_type_138
freq,1,28569,11293


In [7]:
# Compiled / built once at definition time instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_PATTERN = re.compile(r'<.*?>+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one raw e-mail body for tokenisation.

    Steps, in order: lowercase, remove URLs (``http(s)://...`` and
    ``www....``), remove ``<...>`` HTML tags, delete every character in
    ``string.punctuation``. Whitespace is left untouched, so removed spans
    may leave consecutive spaces behind.

    Args:
        text: The raw text. Non-string values (for example numeric cells
            read from a spreadsheet) are converted with ``str``; ``None``
            and float NaN are treated as empty text.

    Returns:
        The cleaned string (``''`` for ``None`` / NaN input).
    """
    if not isinstance(text, str):
        # `x != x` is true only for NaN; avoids needing an extra import.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        # Previously any non-string cell raised AttributeError on .lower().
        text = str(text)

    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    text = _HTML_TAG_PATTERN.sub('', text)
    # Punctuation is deleted (not replaced by a space), same as before.
    return text.translate(_PUNCTUATION_TABLE)

# Run the cleaner over every raw e-mail body and store the result alongside it.
df['clean_text'] = df['Text'].map(clean_text)


In [8]:
from nltk.tokenize import word_tokenize
# word_tokenize needs the 'punkt' tokenizer data, so fetch it first.
nltk.download('punkt')

# Split each cleaned text into a list of word tokens.
df['tokens'] = df['clean_text'].map(word_tokenize)


[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
# A set gives constant-time membership checks during filtering.
stop_words = set(stopwords.words('english'))


def _without_stop_words(tokens):
    """Return the tokens that are not English stop words, in original order."""
    return [token for token in tokens if token not in stop_words]


# NOTE(review): clean_text has already stripped punctuation, so any stop word
# that contains an apostrophe can no longer match its stripped token here —
# confirm whether that matters for this dataset.
df['tokens'] = df['tokens'].apply(_without_stop_words)


In [10]:
# Preview the first rows to check the new clean_text and tokens columns.
df.head()

Unnamed: 0,Text,Category,EmailType,clean_text,tokens
0,"Hi *******, Your payment to Uber India was App...",category_3,email_type_93,hi your payment to uber india was approved pa...,"[hi, payment, uber, india, approved, paid, amo..."
1,Your Zomato Online Ordering receipt Refund Pro...,category_3,email_type_84,your zomato online ordering receipt refund pro...,"[zomato, online, ordering, receipt, refund, pr..."
2,Electricity Bill Payment Successful ‚Çπ 979 Fo...,category_3,email_type_3,electricity bill payment successful ‚çπ 979 fo...,"[electricity, bill, payment, successful, ‚çπ, ..."
3,Payment requested by FINCFRIENDS PVT. LTD. Rec...,category_3,email_type_92,payment requested by fincfriends pvt ltd recei...,"[payment, requested, fincfriends, pvt, ltd, re..."
4,Greetings from Swiggy Your order was delivered...,category_3,email_type_86,greetings from swiggy your order was delivered...,"[greetings, swiggy, order, delivered, 29, minu..."


In [11]:
lemmatizer = WordNetLemmatizer()

# Lemmatize every token with the lemmatizer's default settings.
df['tokens'] = df['tokens'].apply(lambda tokens: list(map(lemmatizer.lemmatize, tokens)))
# Rebuild clean_text from the processed tokens, separated by single spaces.
df['clean_text'] = df['tokens'].apply(' '.join)


In [12]:
# Persist the preprocessed frame without the row index.
# NOTE(review): the tokens column holds Python lists, which CSV can only store
# as text — confirm how downstream code reads that column back.
df.to_csv(path_or_buf='Training_data_preprocessed.csv', index=False)