In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load datasets
calls_df = pd.read_csv('calls.csv')  # Contains 'call_id', 'call_transcript'
reasons_df = pd.read_csv('reasons.csv')  # Contains 'call_id', 'primary_call_reason'
test_df = pd.read_csv('test.csv')  # Contains 'call_id' for which we need to predict reasons

# Step 1: Clean the transcript text
def clean_transcript(text):
    # Remove comments enclosed in "**", punctuation and convert to lowercase
    text = re.sub(r'\*\*.*?\*\*', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower()

calls_df['cleaned_transcript'] = calls_df['call_transcript'].apply(clean_transcript)

# Step 2: Merge calls.csv and reasons.csv to create a training dataset
train_df = pd.merge(calls_df[['call_id', 'cleaned_transcript']], reasons_df, on='call_id')

# Step 3: Prepare data for model
X = train_df['cleaned_transcript']
y = train_df['primary_call_reason']

# Step 4: Split the training data for evaluation purposes
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Convert text into features using CountVectorizer
vectorizer = CountVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# Step 6: Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_vec, y_train)

# Step 7: Predict on validation set and calculate accuracy
y_val_pred = clf.predict(X_val_vec)
print(f'Validation Accuracy: {accuracy_score(y_val, y_val_pred)}')
print(classification_report(y_val, y_val_pred))

# Step 8: Predict on test.csv
test_df = pd.merge(test_df, calls_df[['call_id', 'cleaned_transcript']], on='call_id', how='left')
X_test = vectorizer.transform(test_df['cleaned_transcript'])
test_df['predicted_reason'] = clf.predict(X_test)

# Save predictions to a CSV file
test_df[['call_id', 'predicted_reason']].to_csv('test_predictions.csv', index=False)



Validation Accuracy: 0.20051008926562147


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                         precision    recall  f1-score   support

                Baggage       0.00      0.00      0.00        21
                Baggage       0.00      0.00      0.00       554
              Baggage         0.00      0.00      0.00        29
                Booking       0.00      0.00      0.00       497
              Booking         0.00      0.00      0.00        16
               Check In       0.00      0.00      0.00        76
               Check-In       0.00      0.00      0.00       276
             Check-In         0.00      0.00      0.00         7
               Checkout       0.00      0.00      0.00       373
             Checkout         0.00      0.00      0.00        11
         Communications       0.00      0.00      0.00       746
       Communications         0.00      0.00      0.00        11
      Digital   Support       0.00      0.00      0.00        32
       Digital  Support       0.00      0.00      0.00        19
        Digital Support 