# Indian Railways Complaint & Feedback Management System

## Data Analysis and Model Development

This notebook analyses tweets addressed to Indian Railways and develops a binary text classifier that separates emergency complaints (label `1`) from general feedback (label `0`).


In [None]:
# Import required libraries
import os
import re
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
# Silence every warning so cell output stays readable. Note that this is a
# blanket filter: it also hides library warnings raised in later cells.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn's white grid theme and a wide default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})


## 1. Load and Explore Data


In [None]:
# Load the labelled tweets into `df` (columns used later: 'tweet', 'label').
# Replace with your actual data path
try:
    df = pd.read_csv('data/training_data.csv')
except FileNotFoundError:
    print("Training data file not found. Creating sample data...")
    # Built-in fallback corpus as (tweet, label) pairs so the notebook still
    # runs end to end without the CSV; label 1 = emergency, 0 = feedback.
    sample_rows = [
        ("Train is delayed urgent help needed", 1),
        ("Excellent service on railway", 0),
        ("Accident on track emergency", 1),
        ("Thank you for good service", 0),
        ("Train breakdown help required", 1),
        ("Comfortable journey", 0),
        ("Medical emergency in train", 1),
        ("Clean and tidy coach", 0),
        ("Train cancelled need refund", 1),
        ("Great food quality", 0),
    ]
    df = pd.DataFrame(sample_rows, columns=['tweet', 'label'])
    print(f"Sample data created. Shape: {df.shape}")
else:
    print(f"Data loaded successfully. Shape: {df.shape}")

# Same preview regardless of where the data came from.
print("\nFirst few rows:")
print(df.head())


## 2. Data Preprocessing


In [None]:
def clean_tweet(text):
    """Normalise a raw tweet into lowercase, single-spaced plain text.

    Processing order: remove URLs, remove @mentions, drop every character
    that is neither a word character nor whitespace, collapse runs of
    whitespace, and lowercase the result.

    Args:
        text: The raw tweet. Missing values (None, NaN, pd.NA) produce an
            empty string. Other non-string scalars (for example a numeric
            cell read from a CSV) are converted with ``str()`` instead of
            raising ``TypeError`` inside ``re.sub``.

    Returns:
        str: The cleaned tweet; may be empty.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # Tolerate numeric / other scalar cells rather than crashing.
        text = str(text)

    # Remove URLs ('http\S+' already covers https links).
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove user mentions.
    text = re.sub(r'@\w+', '', text)

    # Remove punctuation and other symbols; word characters and whitespace stay.
    text = re.sub(r'[^\w\s]', '', text)

    # split()/join collapses whitespace runs and trims both ends.
    return ' '.join(text.split()).lower()

# Run every raw tweet through clean_tweet and keep the result alongside the
# original text so the two can be compared.
df['cleaned_tweet'] = df['tweet'].map(clean_tweet)
print("Data cleaned successfully!")

print("\nSample cleaned tweets:")
before_after = df[['tweet', 'cleaned_tweet']]
print(before_after.head())


## 3. Exploratory Data Analysis


In [None]:
# Class distribution (label 0 = Feedback, label 1 = Emergency)
label_counts = df['label'].value_counts()
print("Class Distribution:")
print(label_counts)
# Count each class by explicit comparison; summing the column would only be a
# count if the labels are exactly 0 and 1.
print(f"\nEmergency (1): {(df['label'] == 1).sum()}")
print(f"Feedback (0): {(df['label'] == 0).sum()}")

# Visualize class distribution.
# value_counts() orders by frequency, so sort by label first: this keeps the
# bars in 0, 1 order and the colours tied to the same class on every run.
fig, ax = plt.subplots(figsize=(8, 5))
label_counts.sort_index().plot(kind='bar', color=['skyblue', 'coral'], ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Class (0=Feedback, 1=Emergency)')
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=0)
fig.tight_layout()
plt.show()


## 4. Feature Extraction


In [None]:
# TF-IDF features: at most 1000 terms, English stop words removed.
# NOTE: the vectorizer is fitted here on every row of df, including rows that
# are later held out for testing.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
y = df['label']
X = vectorizer.fit_transform(df['cleaned_tweet'])

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")


## 5. Model Training


In [None]:
# Split data.
# The split is done on the cleaned text rather than on a matrix vectorised
# from the full dataset, and the TF-IDF vectorizer is then (re)fitted on the
# training portion only. Fitting it on all rows would let the test tweets
# influence the vocabulary and IDF weights (data leakage) and inflate the
# evaluation scores. Same test_size / random_state / stratify as before, so
# the row partition is unchanged.
train_text, test_text, y_train, y_test = train_test_split(
    df['cleaned_tweet'], y, test_size=0.2, random_state=42, stratify=y
)
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Train Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

print("\nModel trained successfully!")


## 6. Model Evaluation


In [None]:
# Fix the label order explicitly so the report rows and the confusion-matrix
# axes always line up with the class names, even if one class happens to be
# missing from the test split (otherwise the report raises and the matrix
# shrinks below 2x2).
class_labels = [0, 1]
class_names = ['Feedback', 'Emergency']

# Predictions
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

# Confusion Matrix (rows = true label, columns = predicted label)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()


## 7. Save Model


In [None]:
# Persist the fitted classifier and its TF-IDF vectorizer with joblib so they
# can be loaded together elsewhere. `os` and `joblib` are imported in the
# notebook's import cell at the top.
MODEL_DIR = 'saved_model'
MODEL_PATH = 'saved_model/tweet_classifier_model.pkl'
VECTORIZER_PATH = 'saved_model/tfidf_vectorizer.pkl'

# Create saved_model directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

# Save model and vectorizer
joblib.dump(model, MODEL_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)

print("Model and vectorizer saved successfully!")
print("Files saved:")
# Report the same paths that were written, so the message cannot drift.
for saved_path in (MODEL_PATH, VECTORIZER_PATH):
    print(f"  - {saved_path}")


## 8. Test Predictions


In [None]:
# Test with sample tweets
test_tweets = [
    "Train is delayed by 2 hours, urgent help needed",
    "Great service, comfortable journey",
    "Medical emergency in coach number 5",
    "Thank you for the excellent food"
]

# predict_proba columns follow model.classes_, so look the column up by class
# label instead of using the predicted label value itself as an index (that
# only coincides when the classes are exactly [0, 1]).
class_to_column = {cls: idx for idx, cls in enumerate(model.classes_)}

# Clean and vectorise all tweets in one batch, using the same preprocessing
# as the training data.
cleaned_tweets = [clean_tweet(tweet) for tweet in test_tweets]
tweet_features = vectorizer.transform(cleaned_tweets)
predictions = model.predict(tweet_features)
probabilities = model.predict_proba(tweet_features)

for tweet, prediction, proba in zip(test_tweets, predictions, probabilities):
    label = "Emergency" if prediction == 1 else "Feedback"
    # Probability the model assigns to the class it predicted, as a percentage.
    confidence = proba[class_to_column[prediction]] * 100

    print(f"\nTweet: {tweet}")
    print(f"Prediction: {label} (Confidence: {confidence:.2f}%)")
