In [6]:
import re
from functools import lru_cache
from os.path import exists

import nltk
import numpy as np
import pandas as pd
import scipy.sparse as sp
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Make sure the NLTK data used further down (tokenizer models, the stop-word
# list and WordNet for lemmatization) is installed locally.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Read the raw training and test tables from the working directory.
trainingSet = pd.read_csv("train.csv")
testingSet = pd.read_csv("test.csv")

# Feature engineering function
@lru_cache(maxsize=1)
def _text_resources():
    """Build the English stop-word set and the lemmatizer once, on first use."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text):
    """Normalize a review into a space-joined string of lemmatized tokens.

    The text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, tokenized, filtered against the English
    stop-word list and lemmatized.

    Args:
        text: The raw review text. ``None`` and NaN (how pandas represents a
            missing cell) are treated as an empty document; any other
            non-string value is converted with ``str``.

    Returns:
        The cleaned text as a single string ('' for missing input).
    """
    # Missing cells would otherwise crash on ``.lower()``.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ''
    # Resolved lazily so the function does not depend on module-level names
    # that are only assigned later in the file.
    stop_words, lemmatizer = _text_resources()
    text = re.sub(r'[^\w\s]', '', str(text).lower())
    tokens = word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)

# Add and preprocess features
def add_features_to(df):
    """Add the engineered columns to *df* in place and return it.

    Columns added:
        Helpfulness: ``HelpfulnessNumerator / (HelpfulnessDenominator + 0.01)``;
            the 0.01 offset keeps the division defined when the denominator is 0.
        ProcessedText: ``preprocess_text`` applied to every ``Text`` value.

    Args:
        df: DataFrame with ``HelpfulnessNumerator``, ``HelpfulnessDenominator``
            and ``Text`` columns.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    df['Helpfulness'] = df['HelpfulnessNumerator'] / (df['HelpfulnessDenominator'] + 0.01)
    # ``Text`` contains missing values; turn them into empty strings so the
    # per-row text preprocessing always receives a str.
    df['ProcessedText'] = df['Text'].fillna('').astype(str).apply(preprocess_text)
    return df
# Reuse the cached feature files only when BOTH are present: the cached branch
# reads both, so checking 'X_train.csv' alone would raise FileNotFoundError
# whenever 'X_submission.csv' is missing.
if exists('X_train.csv') and exists('X_submission.csv'):
    trainingSet = pd.read_csv('X_train.csv')
    testingSet = pd.read_csv('X_submission.csv')
else:
    # No complete cache yet: compute the features and write them out so the
    # next run can skip this step.
    trainingSet = add_features_to(trainingSet)
    testingSet = add_features_to(testingSet)
    trainingSet.to_csv('X_train.csv', index=False)
    testingSet.to_csv('X_submission.csv', index=False)


# Load the feature extracted files
# Load both cached files, or rebuild both. With two independent `if`
# statements the `else` belonged only to the 'X_submission.csv' check, so a
# missing 'X_train.csv' left X_train undefined and a missing
# 'X_submission.csv' discarded the X_train that had just been loaded.
if exists('X_train.csv') and exists('X_submission.csv'):
    X_train = pd.read_csv("X_train.csv")
    X_submission = pd.read_csv("X_submission.csv")
else:
    # Process the DataFrame
    train = add_features_to(trainingSet)
    # Merge: keep the rows whose Id appears in the test table, and take the
    # Score column from the test table rather than from the training table.
    X_submission = pd.merge(train, testingSet, left_on='Id', right_on='Id')
    X_submission = X_submission.drop(columns=['Score_x'])
    X_submission = X_submission.rename(columns={'Score_y': 'Score'})
    # Only rows with a known Score can be used for training.
    X_train = train[train['Score'].notnull()]

    X_submission.to_csv("X_submission.csv", index=False)
    X_train.to_csv("X_train.csv", index=False)

print("X_train length is ", len(X_train))
print("X_submission length is ", len(X_submission))

# Work on a reproducible random subset to keep feature extraction tractable.
# The size is capped at the number of available rows because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling is without replacement).
n = min(100000, len(X_train))
X_train = X_train.sample(n, random_state=42)
print("Now X_train length is ", len(X_train))
print("Now X_submission length is ", len(X_submission))
# Splitting the set: 75% for training, 25% held out for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train.drop(columns=['Score']),
    X_train['Score'],
    test_size=1/4.0,
    random_state=0
)


print(X_train.head())
from collections import Counter
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Tokenizer models and the stop-word list are needed by the word-frequency
# analysis below; nltk.download is called again here for both of them.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

def get_most_common_words(text_series, n=50):
    """Return the *n* most frequent words across all texts in *text_series*.

    Missing entries count as empty strings. All texts are joined into one
    lower-cased string and tokenized; tokens that are not purely alphanumeric
    or that appear in the English stop-word list are ignored.

    Args:
        text_series: pandas Series of texts.
        n: How many (word, count) pairs to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    corpus = ' '.join(text_series.fillna('').astype(str)).lower()
    tokens = word_tokenize(corpus)
    ignored = set(stopwords.words('english'))
    counts = Counter(
        token for token in tokens
        if token.isalnum() and token not in ignored
    )
    return counts.most_common(n)

# Put features and labels side by side so the texts can be grouped by rating.
df_combined = pd.concat([X_train, Y_train], axis=1)

# The set of ratings does not change below, so it is computed once instead of
# on every loop iteration.
ratings = sorted(df_combined['Score'].unique())

# Tokenize each rating's texts a single time: the (word, count) pairs are
# printed and also kept for the cross-rating comparison that follows.
class_word_freq = {}
for rating in ratings:
    class_words = get_most_common_words(df_combined[df_combined['Score'] == rating]['Text'])
    class_word_freq[rating] = dict(class_words)
    for word, count in class_words:
        print(f"{word}: {count}")

# Union of every word that made it into some rating's top list.
all_words = set()
for rating in ratings:
    all_words.update(class_word_freq[rating].keys())

# For each word, the gap between its highest and lowest per-rating count;
# a word absent from a rating's top list counts as 0 there.
word_diff = {}
for word in all_words:
    freqs = [class_word_freq[rating].get(word, 0) for rating in ratings]
    word_diff[word] = max(freqs) - min(freqs)

# Show the 20 words whose frequency varies most between ratings.
for word, diff in sorted(word_diff.items(), key=lambda x: x[1], reverse=True)[:20]:
    print(f"{word}: {diff}")



# Initializing
# SentimentIntensityAnalyzer loads the 'vader_lexicon' resource when it is
# constructed, so make sure it is installed like the other NLTK resources.
nltk.download('vader_lexicon')

# Shared, build-once helpers used by preprocess_words and get_sentiment_score.
stopwords_set = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
analyzer = SentimentIntensityAnalyzer()

def load_glove_embeddings(file_path):
    """Load word vectors from a GloVe-style text file.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated vector components.

    Args:
        file_path: Path to the UTF-8 encoded embeddings file.

    Returns:
        A dict mapping each word to a float32 numpy array, or ``None`` when
        the file does not exist (a message is printed in that case).
    """
    print("Loading GloVe embeddings...")
    embeddings = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                # Skip blank lines and lines without any vector component;
                # indexing values[0] on a blank line would raise IndexError.
                if len(values) < 2:
                    continue
                embeddings[values[0]] = np.array(values[1:], dtype='float32')
    except FileNotFoundError:
        print(f"File not found: {file_path}. Please ensure the file path is correct.")
        return None
    print("GloVe Embeddings Loaded")
    return embeddings

# Set GloVe file path and load embeddings.
# The file name indicates 300-dimensional vectors; 300 is also the size of the
# zero-vector fallback used by get_sentence_vector.
glove_path = 'glove.6B/glove.6B.300d.txt'
# load_glove_embeddings returns None (after printing a message) when the file
# is missing, so glove_embeddings must be checked before use.
glove_embeddings = load_glove_embeddings(glove_path)

def preprocess_words(text):
    """Turn raw text into a list of lemmatized, stop-word-free tokens.

    The text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, tokenized, filtered against
    ``stopwords_set`` and lemmatized.

    Args:
        text: The raw text. ``None`` and NaN (a missing cell) yield an empty
            list; any other non-string value is converted with ``str``.

    Returns:
        A list of token strings.
    """
    # A missing value must not be stringified: "nan" would be treated as a
    # real word and could be looked up in the embedding model.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return []
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[^\w\s]', '', text.lower())
    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stopwords_set
    ]

def _embedding_dim(model, default=300):
    """Return the length of one vector stored in *model*, or *default* if unknown."""
    try:
        return len(next(iter(model.values())))
    except (AttributeError, StopIteration, TypeError):
        return default


def get_sentence_vector(sentence, model, dim=None):
    """Return the average embedding of the words in *sentence*.

    Args:
        sentence: Raw text; it is tokenized with ``preprocess_words``.
        model: Mapping from word to vector (for example the dict returned by
            ``load_glove_embeddings``).
        dim: Length of the zero vector returned when no word of the sentence
            is in *model*. When omitted it is taken from the vectors stored in
            *model*, falling back to 300 if that cannot be determined.

    Returns:
        A 1-D numpy array: the mean of the known word vectors, or zeros.
    """
    words = preprocess_words(sentence)
    vectors = [model[word] for word in words if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    # The fallback must have the same length as real vectors, otherwise the
    # rows cannot be stacked into one matrix later.
    if dim is None:
        dim = _embedding_dim(model)
    return np.zeros(dim)

def get_sentiment_score(sentence):
    """Return the compound polarity score of *sentence*.

    Float inputs are converted to their string form first. The text is then
    run through ``preprocess_words`` and re-joined with single spaces before
    it is handed to the module-level ``analyzer``.

    Args:
        sentence: Raw text (or a float such as NaN).

    Returns:
        The ``'compound'`` entry of ``analyzer.polarity_scores``.
    """
    # NOTE(review): preprocess_words removes stop words and punctuation before
    # scoring; presumably this also removes cues the analyzer could use
    # (e.g. negations) - confirm this is intended.
    raw = str(sentence) if isinstance(sentence, float) else sentence
    cleaned = ' '.join(preprocess_words(raw))
    scores = analyzer.polarity_scores(cleaned)
    return scores['compound']

def preprocess_dataframe(df, text_column, summary_column, model):
    """Replace the raw text columns of *df* with derived numeric features.

    Columns added:
        text_vector: averaged word embedding of ``text_column``.
        text_sentiment / summary_sentiment: compound sentiment scores.
        text_length / summary_length: character counts (missing values count
            as empty strings).

    Args:
        df: Input DataFrame; it is not modified.
        text_column: Name of the column holding the full review text.
        summary_column: Name of the column holding the review summary.
        model: Word-to-vector mapping passed on to ``get_sentence_vector``.

    Returns:
        A new DataFrame with the feature columns added and ``text_column`` and
        ``summary_column`` removed.
    """
    # Work on a copy: the frames passed in are slices produced by
    # train_test_split, and assigning columns to such a slice would modify the
    # caller's object and can trigger pandas' SettingWithCopyWarning.
    df = df.copy()

    print(f"Processing text vectors for {text_column}...")
    df['text_vector'] = df[text_column].apply(lambda x: get_sentence_vector(x, model))

    print("sentiment scores:")
    df['text_sentiment'] = df[text_column].apply(get_sentiment_score)
    df['summary_sentiment'] = df[summary_column].apply(get_sentiment_score)

    # Length
    print("text lengths:")
    df['text_length'] = df[text_column].fillna('').apply(len)
    df['summary_length'] = df[summary_column].fillna('').apply(len)

    df = df.drop(columns=[text_column, summary_column])
    print("Dataframe preprocessing complete.")
    return df

# Build the embedding / sentiment / length features for all three splits, but
# only when the GloVe vectors were loaded (glove_embeddings is None when the
# file was not found).
if glove_embeddings:
    # NOTE(review): preprocess_dataframe drops the 'Text' and 'Summary'
    # columns, yet the TF-IDF step later in this file reads X_train['Summary'];
    # confirm which of the two is intended.
    X_train = preprocess_dataframe(X_train, 'Text', 'Summary', glove_embeddings)
    X_test = preprocess_dataframe(X_test, 'Text', 'Summary', glove_embeddings)
    X_submission = preprocess_dataframe(X_submission, 'Text', 'Summary', glove_embeddings)

    # Drop the identifier columns; they are not used as model features.
    columns_to_drop = ['ProductId', 'UserId']
    X_train.drop(columns=columns_to_drop, inplace=True)
    X_test.drop(columns=columns_to_drop, inplace=True)
    X_submission.drop(columns=columns_to_drop, inplace=True)
else:
    # Despite the message, execution continues with the unprocessed frames.
    print("Preprocessing halted due to missing GloVe embeddings.")
# DataFrame.info() prints its report itself and returns None, so this also
# prints a trailing "None".
print(X_train.info())


# Checkpoint every split to disk, then read the checkpoints back.
# The held-out features are stored under "X_text_preprocessed.csv" (sic); the
# same name is used for writing and reading, so the round trip is consistent.
X_train.to_csv("X_train_preprocessed.csv", index=False)
Y_train.to_csv("Y_train_preprocessed.csv", index=False)
X_test.to_csv("X_text_preprocessed.csv", index=False)
Y_test.to_csv("Y_test_preprocessed.csv", index=False)
X_submission.to_csv("X_submission_preprocessed.csv", index=False)
# NOTE(review): 'text_vector' holds numpy arrays; after a CSV round trip the
# column presumably comes back as strings rather than arrays - verify before
# stacking it into a matrix.
X_train = pd.read_csv("X_train_preprocessed.csv")
# read_csv returns DataFrames, so Y_train / Y_test are single-column frames
# (not Series) from here on.
Y_train = pd.read_csv("Y_train_preprocessed.csv")
X_test = pd.read_csv("X_text_preprocessed.csv")
Y_test = pd.read_csv("Y_test_preprocessed.csv")
X_submission = pd.read_csv("X_submission_preprocessed.csv")



# Vectorization with TF-IDF
# The processed review text gets its own vectorizer. Sharing one object with
# the Summary features below meant fitting it twice on different corpora, which
# silently discarded the first vocabulary.
text_tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
# Empty documents come back from CSV as NaN, which the vectorizer rejects.
X_train_tfidf = text_tfidf.fit_transform(trainingSet['ProcessedText'].fillna(''))
X_test_tfidf = text_tfidf.transform(testingSet['ProcessedText'].fillna(''))

# TF-IDF of the review summaries: fitted on the training split only, then
# applied unchanged to the held-out and submission splits.
# NOTE(review): this requires a 'Summary' column; preprocess_dataframe removes
# it when the GloVe features are built - confirm the intended column source.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
summary_tfidf_train = tfidf.fit_transform(X_train['Summary'].fillna(''))
summary_tfidf_test = tfidf.transform(X_test['Summary'].fillna(''))
summary_tfidf_submission = tfidf.transform(X_submission['Summary'].fillna(''))

# Raw term counts of the summaries. The vectorizer was used without ever being
# created (NameError); nothing in this file specifies its settings, so the
# library defaults are used.
count_vectorizer = CountVectorizer()
summary_count_train = count_vectorizer.fit_transform(X_train['Summary'].fillna(''))
summary_count_test = count_vectorizer.transform(X_test['Summary'].fillna(''))
summary_count_submission = count_vectorizer.transform(X_submission['Summary'].fillna(''))

numeric_features = ['text_sentiment', 'text_length', 'summary_length']


def _stack_text_vectors(df, dim=300):
    """Return an (n_rows, dim) array built from df['text_vector'].

    Falls back to zeros when the column is absent. Values that are strings
    (the printed form of an array, e.g. after a CSV round trip) are parsed
    back into float arrays; array values are used as they are.
    """
    if 'text_vector' not in df.columns:
        return np.zeros((df.shape[0], dim))
    return np.vstack([
        np.array(value.strip('[]').split(), dtype=float) if isinstance(value, str) else value
        for value in df['text_vector'].values
    ])


X_train_text_vector = _stack_text_vectors(X_train)
X_test_text_vector = _stack_text_vectors(X_test)
X_submission_text_vector = _stack_text_vectors(X_submission)

# Combining features: sentence embedding columns followed by the Summary
# TF-IDF columns, in the same order for every split.
X_train_combined = sp.hstack([
    sp.csr_matrix(X_train_text_vector),
    summary_tfidf_train,
])

X_test_combined = sp.hstack([
    sp.csr_matrix(X_test_text_vector),
    summary_tfidf_test,
])

X_submission_combined = sp.hstack([
    sp.csr_matrix(X_submission_text_vector),
    summary_tfidf_submission,
])

import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer
from sklearn.preprocessing import LabelEncoder

# The labels may be a Series or a single-column DataFrame (after being read
# back from CSV). Flatten them to 1-D so scikit-learn does not have to coerce
# a column vector (the source of the column_or_1d warning).
y_train_labels = np.ravel(Y_train)
y_test_labels = np.ravel(Y_test)

# XGBoost expects class ids 0..k-1, so the original scores are encoded for
# training and decoded again after prediction.
le = LabelEncoder()
Y_train_encoded = le.fit_transform(y_train_labels)

model = xgb.XGBClassifier(
    objective='multi:softmax',
    booster='gbtree',
    # Derived from the data instead of hard-coding 5.
    num_class=len(le.classes_),
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=3,
    n_estimators=1000,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=1.0,
    random_state=42
)

# Learn the model
model.fit(X_train_combined, Y_train_encoded)

Y_test_predictions = model.predict(X_test_combined)
y_pred = le.inverse_transform(Y_test_predictions)

# Model Evaluation
# Evaluate on the held-out split predicted above. The previous code referred
# to X_val_tfidf, encoder and y_val, none of which exist in this file.
predictions = y_pred

print("Accuracy on testing set:", accuracy_score(y_test_labels, predictions))
print("Classification Report:\n", classification_report(y_test_labels, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test_labels, predictions))

# Predict the submission rows and write the two-column submission file.
Y_submission_predictions = model.predict(X_submission_combined)
X_submission['Score'] = le.inverse_transform(Y_submission_predictions)
submission = X_submission[['Id', 'Score']]
submission.to_csv("submission.csv", index=False)

# Offer the file as a browser download when running inside Google Colab;
# elsewhere the module is unavailable and the file simply stays on disk.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; submission.csv was written to the working directory.")
else:
    files.download('submission.csv')

X_train length is  1485341
X_submission length is  212192
Now X_train length is  100000
Now X_submission length is  212192
              Id   ProductId          UserId        Time  \
578158   1490873  B005LAIIQC  A37L1OGFD7SB2I  1355875200   
249432   1149956  B0015LPS1Y   ARC10GZN44C34  1403568000   
1364280    24661  0780020693  A2NUD9S80DZRQG  1209945600   
441970    696595  B000063V8U  A2M2MUKWB2TRVL  1399766400   
906438    305805  6302287375  A338L6RMPYT3ZR  1333411200   

                                       Summary  \
578158                         nothing special   
249432                           love stargate   
1364280                 on the nostalgia wings   
441970   Part of one of the finest series ever   
906438             Love that but disappointed.   

                                                      Text  Helpfulness  
578158   some great moments, nice camera work and great...          0.0  
249432   We had been looking for this series for quite ...         

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


movie: 6991
film: 3494
one: 3207
like: 2980
would: 2138
even: 1972
bad: 1776
time: 1657
good: 1617
could: 1501
really: 1482
get: 1481
see: 1375
story: 1259
movies: 1256
people: 1255
first: 1220
much: 1206
dvd: 1149
make: 1107
made: 1015
watch: 958
way: 955
never: 899
acting: 885
better: 885
know: 881
well: 860
think: 851
quot: 846
ever: 830
plot: 808
two: 801
characters: 796
nothing: 792
great: 780
character: 770
money: 755
seen: 743
watching: 742
show: 738
say: 723
many: 703
back: 692
something: 680
little: 679
want: 678
also: 677
films: 672
original: 654
movie: 6869
film: 4945
one: 3709
like: 3206
good: 2399
would: 2337
really: 2048
even: 1932
story: 1821
much: 1798
get: 1623
time: 1564
see: 1514
could: 1511
first: 1403
bad: 1363
well: 1215
make: 1204
better: 1203
two: 1184
movies: 1149
way: 1143
character: 1130
great: 1129
characters: 1093
people: 1078
made: 1069
think: 1040
also: 1017
quot: 997
plot: 994
dvd: 990
little: 964
know: 946
many: 935
watch: 920
never: 895
scenes: 888
act

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


<class 'pandas.core.frame.DataFrame'>
Index: 75000 entries, 578158 to 1119578
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id           75000 non-null  int64  
 1   ProductId    75000 non-null  object 
 2   UserId       75000 non-null  object 
 3   Time         75000 non-null  int64  
 4   Summary      74998 non-null  object 
 5   Text         74997 non-null  object 
 6   Helpfulness  75000 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 4.6+ MB
None


  y = column_or_1d(y, warn=True)


Accuracy on testing set =  0.58052
Classification Report:                precision    recall  f1-score   support

         1.0       0.59      0.27      0.37      1558
         2.0       0.36      0.08      0.13      1409
         3.0       0.50      0.16      0.24      3012
         4.0       0.43      0.14      0.21      5629
         5.0       0.60      0.95      0.73     13392

    accuracy                           0.58     25000
   macro avg       0.50      0.32      0.34     25000
weighted avg       0.54      0.58      0.50     25000

Confusion Matrix:  [[  425    85    73    48   927]
 [  139   109   138   102   921]
 [   54    78   486   336  2058]
 [   44    16   185   776  4608]
 [   54    16    82   523 12717]]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>