In [3]:
import pandas as pd

In [4]:
# Load the disputed-claims workbook with pandas' default read_excel settings.
# The path is relative, so the file must be in the notebook's working directory.
df1 = pd.read_excel('ST-GWC Debit ID 6001363_(DISPUTED CLAIMS) 05122023.xlsx')  

In [None]:
# Working frame: every column of the raw sheet except the last three
# (presumably not needed for modelling — TODO confirm which columns these are).
# .copy() gives `df` its own data, so the column assignments made in later cells
# write to this frame directly rather than to a slice of `df1`, which would
# trigger pandas' SettingWithCopyWarning.
df = df1.iloc[:, :-3].copy()

# Handling Abbreviations


In [None]:
# Abbreviation lookup table; the next cell reads its 'Acronym' and 'Full Form' columns.
abbreviations_df = pd.read_excel('ABBREV.xlsx')

In [None]:
# Acronym -> full-form lookup built row by row; when an acronym appears in
# several rows, the last row's full form is the one that is kept.
abbreviations = {
    acronym: full_form
    for acronym, full_form in zip(abbreviations_df['Acronym'], abbreviations_df['Full Form'])
}

In [None]:
def replace_abbreviations(text):
    """Expand known abbreviations in a comment, one whitespace-separated word at a time.

    Args:
        text: Cell value from a comments column. Missing values (None/NaN/NA)
            yield an empty string; any other value is converted with str().

    Returns:
        str: The words of `text` joined by single spaces, where every word that
        is an exact key of the module-level `abbreviations` dict has been
        replaced by its full form. Matching is case-sensitive and whole-word,
        so a word with attached punctuation (e.g. "A/C,") is left unchanged.
    """
    # Without this guard str() would turn a missing cell into the literal token
    # "nan", which would then be spell-corrected and fed to the model as a word.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''
    return ' '.join(abbreviations.get(word, word) for word in str(text).split())

In [None]:
# Expand abbreviations element-wise in both free-text comment columns.
df['Technician Comments'] = df['Technician Comments'].map(replace_abbreviations)
df['Customer Comments'] = df['Customer Comments'].map(replace_abbreviations)

# Handling Spelling Errors

In [None]:
from spellchecker import SpellChecker
# Shared spell-checker instance (library defaults); correct_spelling() below calls it for every word.
spell = SpellChecker()

In [None]:
def correct_spelling(text):
    """Spell-correct a comment word by word with the shared `spell` checker.

    Args:
        text: Comment text. Missing values (None/NaN/NA) are treated as empty
            text; any other non-string value is converted with str().

    Returns:
        str: The whitespace-separated words of `text` joined by single spaces.
        Each word is replaced by `spell.correction(word)` when the checker
        offers a correction and is kept unchanged otherwise.
    """
    if not isinstance(text, str):
        # Non-string cells (e.g. NaN floats) have no .split(); normalise them first.
        text = '' if (pd.api.types.is_scalar(text) and pd.isna(text)) else str(text)

    corrected_words = []
    for word in text.split():
        suggestion = spell.correction(word)
        # correction() returns None when it has no candidate, which is typical
        # for part numbers, codes and jargon. Keep the original token in that
        # case: substituting '' would silently delete the word from the comment.
        corrected_words.append(word if suggestion is None else suggestion)

    return ' '.join(corrected_words)

In [None]:
# Run the spell-corrector element-wise over both comment columns.
# This calls the checker once per word, so it can be slow on large sheets.
df['Technician Comments'] = df['Technician Comments'].map(correct_spelling)
df['Customer Comments'] = df['Customer Comments'].map(correct_spelling)

In [None]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Encode the target column: an empty cell becomes 'accepted', an 'X' becomes 'rejected'.
# The result is assigned back to the column instead of calling fillna/replace
# with inplace=True on `df[...]`: that chained form operates on an intermediate
# object, raises a FutureWarning in pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is enabled.
df['SUPPLIER DISPUTES VALIDITY?'] = (
    df['SUPPLIER DISPUTES VALIDITY?']
    .fillna('accepted')
    .replace('X', 'rejected')
)

In [None]:
# Step 1: Data Preparation
# One text field per claim: technician comment, a single space, customer comment.
df['combined_comments'] = (
    df['Technician Comments']
    + " "
    + df['Customer Comments']
)
# Target labels for the classifier.
y = df['SUPPLIER DISPUTES VALIDITY?']
# Model input: the combined free-text comments.
X = df['combined_comments']

In [None]:
# Step 2: Text Preprocessing
# Fetch the NLTK data used by preprocess_text(). word_tokenize() needs the Punkt
# sentence models: older NLTK releases load them from the 'punkt' package, recent
# releases from 'punkt_tab'. Request both so tokenization works with either;
# without 'punkt_tab' a recent NLTK raises LookupError on the first word_tokenize() call.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
# Set membership makes the per-token stop-word test in preprocess_text() cheap.
stop_words = set(stopwords.words('english'))

In [None]:
def preprocess_text(text):
    """Normalise a comment for vectorization.

    Lower-cases `text`, tokenizes it with nltk.word_tokenize, discards every
    token that is not purely alphabetic or that appears in the module-level
    `stop_words` set, and returns the surviving tokens joined by single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text.lower()):
        # Skip stop words as well as punctuation, numbers and mixed tokens.
        if token in stop_words or not token.isalpha():
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
# Clean every combined comment element-wise; X now holds the normalised text.
X = X.map(preprocess_text)

In [None]:
# Bag-of-words features: learn the vocabulary from every comment in X and encode
# each comment as a row of token counts (CountVectorizer defaults).
# NOTE(review): the vocabulary is fitted before the train/test split that follows,
# so it also contains words that occur only in held-out rows.
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

In [None]:
# Fit a logistic-regression classifier (scikit-learn defaults) on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out split: overall accuracy followed by scikit-learn's
# classification report for the predicted labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

In [None]:
import joblib

# Save the trained model to a file
model_filename = "supplier_validity_model.pkl"

# The classifier only accepts input encoded with the vocabulary of the fitted
# CountVectorizer, so persist the vectorizer as well. Without it the saved model
# cannot be used for predictions in a fresh session.
vectorizer_filename = "supplier_validity_vectorizer.pkl"
joblib.dump(vectorizer, vectorizer_filename)

joblib.dump(model, model_filename)


In [None]:
# Load the saved model from the file
# joblib.load unpickles the file, so only load artifacts from a trusted source
# (here: the file written by the previous cell).
loaded_model = joblib.load(model_filename)

# Example function to get predictions
def get_supplier_validity_prediction(technician_comments, customer_comments):
    """Predict the supplier-validity label for one pair of comments.

    The two strings are joined with a single space, cleaned with
    preprocess_text(), encoded with the module-level fitted `vectorizer`, and
    classified by `loaded_model`.

    Args:
        technician_comments: Technician comment text (str).
        customer_comments: Customer comment text (str).

    Returns:
        The first (and only) element of the model's prediction for this input.
    """
    cleaned_text = preprocess_text(technician_comments + " " + customer_comments)
    # transform() expects an iterable of documents, hence the one-element list.
    features = vectorizer.transform([cleaned_text])
    return loaded_model.predict(features)[0]

# # Example usage
# technician_comments = "The product had some technical issues."
# customer_comments = "I received the product in good condition, but it stopped working after a week."
# predicted_validity = get_supplier_validity_prediction(technician_comments, customer_comments)
# print("Predicted Supplier Validity:", predicted_validity)


In [None]:
# Predict a label for every row from its two comment columns and store the
# results, in row order, in a new 'predicted_validity' column.
df['predicted_validity'] = [
    get_supplier_validity_prediction(technician_text, customer_text)
    for technician_text, customer_text in zip(df['Technician Comments'], df['Customer Comments'])
]

# Show the new 'predicted_validity' column.
print(df['predicted_validity'])