In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the labelled dataset. The columns used below are 'Keyword' (the text
# to classify) and 'Category' / 'Subcategory' (the two target labels).
df = pd.read_csv(r"C:\Users\a918147\OneDrive - ATOS\Documents\Python\Text Analysis\Keyword and categories.csv")

# Rows without keyword text cannot be vectorized later on (TfidfVectorizer
# cannot process missing values), so drop them before splitting.
df = df.dropna(subset=['Keyword'])

# Fit the label encoders on the complete label columns *before* splitting.
# If they were fitted on the training split only, any label that happens to
# occur solely in the test split would make `transform` raise a ValueError
# ("y contains previously unseen labels").
le_category = LabelEncoder().fit(df['Category'])
le_subcategory = LabelEncoder().fit(df['Subcategory'])

# Split the data into training and testing sets (80% / 20%).
# A fixed random_state keeps the split, and therefore the reported accuracy
# and the final predictions, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Keyword'],
    df[['Category', 'Subcategory']],
    test_size=0.2,
    random_state=42,
)

# Take explicit copies so the column assignments below write to independent
# frames rather than to objects derived from `df` (pandas may otherwise emit
# a SettingWithCopyWarning).
y_train = y_train.copy()
y_test = y_test.copy()

# Replace the string labels with their integer codes in both splits
y_train['Category'] = le_category.transform(y_train['Category'])
y_train['Subcategory'] = le_subcategory.transform(y_train['Subcategory'])
y_test['Category'] = le_category.transform(y_test['Category'])
y_test['Subcategory'] = le_subcategory.transform(y_test['Subcategory'])

# Split the target data into separate columns, one per classifier
y_train_category = y_train['Category']
y_train_subcategory = y_train['Subcategory']

# Convert the keyword text into TF-IDF features. The vectorizer is fitted on
# the training split only and then reused, unchanged, for the test split.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Category model: a one-vs-rest SVC, trained on the training features and
# scored on the held-out test split.
clf_category = OneVsRestClassifier(SVC()).fit(X_train_vectorized, y_train_category)
score_category = clf_category.score(X_test_vectorized, y_test['Category'])

# Subcategory model: same setup, trained and scored independently of the
# category model.
clf_subcategory = OneVsRestClassifier(SVC()).fit(X_train_vectorized, y_train_subcategory)
score_subcategory = clf_subcategory.score(X_test_vectorized, y_test['Subcategory'])

# Report the test-set score of each classifier
print(f'Category accuracy: {score_category}')
print(f'Subcategory accuracy: {score_subcategory}')

# Read the tickets that need to be categorized
input_df = pd.read_csv(r"C:\Users\a918147\OneDrive - ATOS\Documents\Python\Text Analysis\Trial for Ticket categorization.csv")

# Take the text from the "inc_short_description" column; missing descriptions
# become empty strings so that every row can still be vectorized.
new_text = input_df['inc_short_description'].fillna('')

# Project the new text onto the TF-IDF vocabulary learned during training
new_text_vectorized = vectorizer.transform(new_text)

# Predict the encoded labels and map them straight back to the original
# category and subcategory names.
predictions_category = le_category.inverse_transform(
    clf_category.predict(new_text_vectorized)
)
predictions_subcategory = le_subcategory.inverse_transform(
    clf_subcategory.predict(new_text_vectorized)
)

# Attach the predicted labels to the input rows
input_df = input_df.assign(
    Category=predictions_category,
    Subcategory=predictions_subcategory,
)

# Write the enriched table to a new CSV file (without the index column)
input_df.to_csv(r"C:\Users\a918147\OneDrive - ATOS\Documents\Python\Text Analysis\Trial for Ticket categorization - Output.csv", index=False)