In [1]:
import pandas as pd
import numpy as np

In [3]:
# Workbook containing the chatbot examples that the rest of the notebook trains on.
# NOTE(review): this is an absolute path on one Windows machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
DATASET_PATH = r"C:\Users\r.lavanya\Desktop\Intern MF\CHABUU\AKL MULTIDATA.xlsx"

df = pd.read_excel(DATASET_PATH)
df

Unnamed: 0,User Input,Bot Response,Language,Intents
0,Hello,Hi there!,en,Greeting
1,Vanakkam,Vanakkam! Ungalukku epdi help panradhu?,ta,Greeting
2,Namaskaram,Namaskaram! Meeeku ela sahayam cheyyali?,te,Greeting
3,How are you?,Im doing well!,en,Well-being
4,Epdi irukeenga?,Naan nalla irukken!,ta,Well-being
...,...,...,...,...
1504,pasangalukku oru advice sollunga,"Aaraaychi seyyunga, kelvi kettunga, kathukunga!",ta,Motivation
1505,pillalaki oka advice cheppu,"Aalochinchandi, prasnalu adugandi, chudandi!",te,Motivation
1506,how to manage time,Prioritize and set a simple schedule.,en,Tip
1507,neram epdi manage panna,"Mukkiyamana vishayangalai first panna, schedul...",ta,Tip


In [4]:
# Remove rows with any missing value first, then rows that are exact duplicates.
df = df.dropna().drop_duplicates()

# Show how many examples remain for each intent label.
intent_counts = df['Intents'].value_counts()
print(intent_counts)

Intents
Info              210
Response          168
Conversational    124
Tip                99
Feeling            88
Intro              81
Humor              81
Motivation         81
Suggest            63
Fact               54
Recommend          47
Reminder           42
Acknowledge        42
Greeting           36
Story              35
Assist             35
Farewell           34
Spell              29
Appreciation       28
Well-being         27
Confirmation       24
Help               23
Define             21
Time               20
Action             14
Fun                 3
Name: count, dtype: int64


In [5]:
import re

def clean_text(text):
    """Normalise a piece of text before it is vectorised.

    The value is converted to ``str`` and lower-cased, every character that is
    neither a word character nor whitespace is removed (underscores count as
    word characters and are kept), runs of digits are removed, and finally
    leading/trailing whitespace is stripped.

    Args:
        text: Any value; it is passed through ``str()`` first.

    Returns:
        The normalised string (possibly empty).
    """
    lowered = str(text).lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    without_digits = re.sub(r'\d+', '', without_punctuation)
    return without_digits.strip()

# User inputs feed the classifier, so they get the full clean_text normalisation.
raw_user_inputs = df['User Input']
df['User Input'] = raw_user_inputs.apply(clean_text)

# Bot responses are only cast to str and trimmed of surrounding whitespace.
response_strings = df['Bot Response'].astype(str)
df['Bot Response'] = response_strings.str.strip()

In [6]:
from sklearn.preprocessing import LabelEncoder

# Map each intent label to an integer code and store the codes next to the labels.
le = LabelEncoder()
intent_codes = le.fit_transform(df['Intents'])
df['Intent Encoded'] = intent_codes

# Label -> code lookup, printed for reference.
codes_for_classes = le.transform(le.classes_)
intent_mapping = {label: code for label, code in zip(le.classes_, codes_for_classes)}
print(intent_mapping)

{'Acknowledge': 0, 'Action': 1, 'Appreciation': 2, 'Assist': 3, 'Confirmation': 4, 'Conversational': 5, 'Define': 6, 'Fact': 7, 'Farewell': 8, 'Feeling': 9, 'Fun': 10, 'Greeting': 11, 'Help': 12, 'Humor': 13, 'Info': 14, 'Intro': 15, 'Motivation': 16, 'Recommend': 17, 'Reminder': 18, 'Response': 19, 'Spell': 20, 'Story': 21, 'Suggest': 22, 'Time': 23, 'Tip': 24, 'Well-being': 25}


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Character n-gram TF-IDF (2 to 5 characters, at most 4000 features) over the cleaned inputs.
# NOTE(review): the vectorizer is fitted on every row, i.e. before the later
# train/test split, so its vocabulary and IDF weights also see the test rows.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 5),
    max_features=4000,
)
cleaned_inputs = df['User Input']
X = vectorizer.fit_transform(cleaned_inputs)
y = df['Intent Encoded']


In [8]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# NOTE(review): oversampling happens here on the full data set, and the train/test
# split in a later cell is taken from the resampled arrays. Synthetic rows interpolated
# from training neighbours therefore land in the test set, so the accuracies reported
# below are optimistic. Splitting first and resampling only the training part would
# avoid that, but needs the split and report cells changed together.
counts_before = Counter(y)
print("Before SMOTE:", counts_before)

# k_neighbors=1 -- presumably because the rarest intent has only a few rows; confirm.
sm = SMOTE(random_state=42, k_neighbors=1)
X_resampled, y_resampled = sm.fit_resample(X, y)

counts_after = Counter(y_resampled)
print("After SMOTE:", counts_after)

Before SMOTE: Counter({14: 210, 19: 168, 5: 124, 24: 99, 9: 88, 13: 81, 16: 81, 15: 81, 22: 63, 7: 54, 17: 47, 18: 42, 0: 42, 11: 36, 3: 35, 21: 35, 8: 34, 20: 29, 2: 28, 25: 27, 4: 24, 12: 23, 6: 21, 23: 20, 1: 14, 10: 3})
After SMOTE: Counter({11: 210, 25: 210, 13: 210, 23: 210, 2: 210, 8: 210, 18: 210, 14: 210, 4: 210, 9: 210, 3: 210, 16: 210, 5: 210, 21: 210, 15: 210, 20: 210, 0: 210, 17: 210, 1: 210, 12: 210, 24: 210, 19: 210, 22: 210, 7: 210, 6: 210, 10: 210})


In [9]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the SMOTE-resampled data with a fixed seed.
# NOTE(review): because the input is already oversampled, the held-out part contains
# synthetic rows; treat scores computed on it as an upper bound.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_resampled,
)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def _fit_and_score(label, estimator):
    """Fit `estimator` on the training split and print its accuracy on the test split.

    Args:
        label: Name printed in front of "Accuracy:".
        estimator: An unfitted scikit-learn classifier.

    Returns:
        The predictions of the fitted estimator for ``X_test``.
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    print(f"{label} Accuracy:", accuracy_score(y_test, predictions))
    return predictions


# Logistic
lr = LogisticRegression(max_iter=300)
lr_pred = _fit_and_score("Logistic", lr)

# SVM
svm = SVC(kernel='linear')
svm_pred = _fit_and_score("SVM", svm)

# Random Forest
# Seeded like the SMOTE and split steps so the reported accuracy is repeatable between runs.
rf = RandomForestClassifier(random_state=42)
rf_pred = _fit_and_score("Random Forest", rf)


Logistic Accuracy: 0.8543956043956044
SVM Accuracy: 0.8937728937728938
Random Forest Accuracy: 0.9010989010989011


In [11]:
from sklearn.metrics import classification_report

# Per-intent precision/recall/F1 of the SVM predictions, labelled with the intent names.
svm_report = classification_report(y_test, svm_pred, target_names=le.classes_)
print("\nClassification Report:\n", svm_report)


Classification Report:
                 precision    recall  f1-score   support

   Acknowledge       0.89      1.00      0.94        42
        Action       1.00      1.00      1.00        42
  Appreciation       0.93      1.00      0.97        42
        Assist       0.83      0.90      0.86        42
  Confirmation       0.86      0.88      0.87        42
Conversational       0.68      0.64      0.66        42
        Define       0.98      0.98      0.98        42
          Fact       0.85      0.81      0.83        42
      Farewell       1.00      1.00      1.00        42
       Feeling       0.85      0.83      0.84        42
           Fun       1.00      1.00      1.00        42
      Greeting       0.91      0.98      0.94        42
          Help       0.97      0.90      0.94        42
         Humor       0.93      0.88      0.90        42
          Info       0.55      0.57      0.56        42
         Intro       0.88      0.88      0.88        42
    Motivation       0

In [12]:
# Use the fitted linear SVM as the classifier; the chatbot's get_response() reads this
# global `model`, and the save cells persist it.
model = svm

In [13]:
import random

def clean_text(text):
    """Lower-case `text` and strip punctuation, digits and surrounding whitespace.

    Args:
        text: Any value; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string, containing only word characters and whitespace
        (digits removed); may be empty.
    """
    cleaned = str(text).lower()
    # First drop everything that is neither a word character nor whitespace,
    # then drop the digits that the first pattern leaves behind.
    for pattern in (r'[^\w\s]', r'\d+'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

def detect_language(user_input):
    """Guess the language code of `user_input` from a few marker substrings.

    The lower-cased input is searched for each marker as a substring (not as a
    whole word). The 'ta' markers are checked before the 'te' markers, and
    'en' is returned when none of them occurs.

    Args:
        user_input: The raw text typed by the user.

    Returns:
        'ta', 'te' or 'en'.
    """
    markers_by_language = (
        ('ta', ('vanakkam', 'epdi', 'sapita', 'enna', 'irukeenga', 'eruka', 'unga', 'naan')),
        ('te', ('namaskaram', 'meeru', 'cheppandi', 'nuvvu', 'samsarikum', 'meeeku')),
    )
    lowered = user_input.lower()
    for language_code, markers in markers_by_language:
        for marker in markers:
            if marker in lowered:
                return language_code
    return 'en'

def is_math_expression(text):
    """Tell whether `text` looks like a plain arithmetic expression.

    After stripping surrounding whitespace, the text must consist only of
    digits, whitespace, ``+ - * / .`` and parentheses, and it must contain at
    least one digit. The digit requirement keeps inputs such as ``...``,
    ``()`` or ``-`` from being treated as something to calculate.

    Args:
        text: The raw user input string.

    Returns:
        True if the text qualifies, otherwise False.
    """
    candidate = text.strip()
    if re.search(r'\d', candidate) is None:
        return False
    return re.fullmatch(r'[\d\s\+\-\*/\.\(\)]+', candidate) is not None

def rule_based_response(text):
    """Return a canned reply when `text` mentions one of a few trigger words.

    Triggers are matched at word boundaries on the lower-cased text, so that
    unrelated words which merely contain a trigger ("whole", "history") no
    longer fire a rule. Rules are tried in this order and the first match wins:

    * a word starting with "quote"                -> motivational quote
    * "who" / "whos" / "who's" / "whose" / "whom" -> bot introduction
    * a word starting with "story"                -> short story

    Args:
        text: The raw user input string.

    Returns:
        The reply string for the first matching rule, or None if no rule matches.
    """
    lowered = text.lower()
    rules = (
        (r'\bquote', "🌟 Believe in yourself. Every expert was once a beginner."),
        (r'\bwho(?:s|se|m)?\b', "🤖 I'm your friendly chatbot assistant!"),
        (r'\bstory', "📖 Once upon a time, a student built the smartest bot with my help!"),
    )
    for pattern, reply in rules:
        if re.search(pattern, lowered):
            return reply
    return None

def _evaluate_arithmetic(expression):
    """Evaluate a plain arithmetic expression without handing it to eval().

    Only numeric literals, parentheses, unary +/- and the binary operators
    ``+ - * / // **`` are accepted. Anything else raises ValueError, and
    malformed input raises SyntaxError from the parser.
    """
    import ast
    import operator

    max_exponent = 100  # keeps inputs such as 9**9**9 from stalling the chat loop
    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Pow: operator.pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}

    def _eval(node):
        if isinstance(node, ast.Constant) and type(node.value) in (int, float):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            left = _eval(node.left)
            right = _eval(node.right)
            if isinstance(node.op, ast.Pow) and abs(right) > max_exponent:
                raise ValueError("exponent too large")
            return binary_ops[type(node.op)](left, right)
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](_eval(node.operand))
        raise ValueError("unsupported expression")

    # Unlike eval(), the parser rejects leading whitespace, hence the strip().
    return _eval(ast.parse(expression.strip(), mode="eval").body)


def get_response(user_input):
    """Produce the bot's reply for one line of user input.

    Steps, in order:

    1. Arithmetic input (per ``is_math_expression``) is calculated and returned.
       This check runs on the raw input and comes first, because ``clean_text``
       removes digits and punctuation and would otherwise reduce every
       expression to an empty string that the short-input guard swallows.
    2. Very short input (a single word of fewer than four characters after
       cleaning) gets a request for more detail.
    3. ``rule_based_response`` may supply a canned reply.
    4. Otherwise the intent is predicted with the global ``vectorizer``,
       ``model`` and ``le``, and a random ``Bot Response`` with that intent and
       the detected language is picked from the global ``df``.

    Args:
        user_input: The raw text typed by the user.

    Returns:
        The reply string.
    """
    if is_math_expression(user_input):
        try:
            # Formatting stays inside the try: converting a very large int to text can raise too.
            return f"🧮 The answer is {_evaluate_arithmetic(user_input)}"
        except (SyntaxError, ValueError, TypeError, ArithmeticError, RecursionError, MemoryError):
            return "❌ Sorry, I couldn't calculate that."

    cleaned = clean_text(user_input)

    if len(cleaned.split()) < 2 and len(cleaned) < 4:
        return "🤖 Hmm, can you tell me a bit more?"

    rule = rule_based_response(user_input)
    if rule:
        return rule

    lang = detect_language(user_input)

    vector = vectorizer.transform([cleaned])
    pred = model.predict(vector)[0]
    intent = le.inverse_transform([pred])[0]

    matches = (df['Intents'] == intent) & (df['Language'] == lang)
    responses = df.loc[matches, 'Bot Response'].tolist()

    if responses:
        return random.choice(responses)
    return f"🤖 Sorry, I couldn’t find a reply in {lang.upper()}."

# Interactive console loop: keeps answering until the user types "exit" (any letter case).
print("🤖 ChatBot is ready! Type 'exit' to stop.\n")

chatting = True
while chatting:
    inp = input("You: ")
    if inp.lower() != "exit":
        print("Bot:", get_response(inp))
    else:
        print("Bot: Bye! 👋")
        chatting = False

🤖 ChatBot is ready! Type 'exit' to stop.



You:  Hi


Bot: 🤖 Hmm, can you tell me a bit more?


You:  vanakam


Bot: Hi!


You:  exit


Bot: Bye! 👋


In [107]:
import joblib

# Write the classifier, the fitted vectorizer and the label encoder to the working
# directory with joblib, one file per object.
joblib_artifacts = (
    ('chatbot_model.pkl', model),
    ('chatbot_vectorizer.pkl', vectorizer),
    ('chatbot_labelencoder.pkl', le),
)
for filename, artifact in joblib_artifacts:
    joblib.dump(artifact, filename)

print("downloaded")

downloaded


In [14]:
import pickle

# Save the model, the vectorizer and the label encoder (if used) as separate pickle files.
pickle_artifacts = (
    ("aklchatbot_model.pkl", model),
    ("aklchatbot_vectorizer.pkl", vectorizer),
    ("aklchatbot_labelencoder.pkl", le),
)
for filename, artifact in pickle_artifacts:
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)
