In [7]:
import pandas as pd
from scipy.stats import ks_2samp
from scipy.stats import wasserstein_distance
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the baseline dataset (data.csv) and the newly collected data (new_data.csv)
df = pd.read_csv('../Data/data.csv')
new_df = pd.read_csv('../Data/new_data.csv')
# Shuffle the baseline rows with a fixed seed so reruns give the same row order.
# NOTE(review): the statistics below are computed on pooled values, so row order
# should not affect them — confirm whether this shuffle is still needed.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Column being compared: 'Review' (missing values are replaced by empty strings)
reviews1 = df['Review'].fillna("")
reviews2 = new_df['Review'].fillna("")

# TF-IDF vectorization: the vocabulary (capped at 5000 terms) is fitted on the
# baseline reviews and reused unchanged to transform the new reviews.
vectorizer = TfidfVectorizer(max_features=5000)
# ravel() collapses the freshly built dense array to 1-D without the extra copy
# that flatten() makes; the values are identical.
tfidf1 = vectorizer.fit_transform(reviews1).toarray().ravel()
tfidf2 = vectorizer.transform(reviews2).toarray().ravel()

# Two-sample Kolmogorov-Smirnov test on the pooled TF-IDF weights.
# NOTE(review): every weight of every document is pooled into one sample, so the
# values are presumably not independent draws — treat the p-value as indicative
# and read it together with the KS statistic and the Wasserstein distance.
ks_stat, ks_pvalue = ks_2samp(tfidf1, tfidf2)
print(f"KS Statistic: {ks_stat}, p-value: {ks_pvalue}")

# Wasserstein distance between the same two pooled samples
wd = wasserstein_distance(tfidf1, tfidf2)
print(f"Wasserstein Distance: {wd}")

KS Statistic: 0.010409912156556511, p-value: 7.842598544668252e-10
Wasserstein Distance: 0.0008718507591024807


In [8]:
# Load train and test data
train_df = pd.read_csv('../Data/train.csv')
test_df = pd.read_csv('../Data/test.csv')
# Shuffle the training rows with a fixed seed so reruns give the same row order.
# NOTE(review): the statistics below are computed on pooled values, so row order
# should not affect them — confirm whether this shuffle is still needed.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Column being compared: 'Review' (missing values are replaced by empty strings)
train_reviews = train_df['Review'].fillna("")
test_reviews = test_df['Review'].fillna("")

# TF-IDF vectorization: the vocabulary (capped at 5000 terms) is fitted on the
# training reviews and reused unchanged to transform the test reviews.
vectorizer = TfidfVectorizer(max_features=5000)
# ravel() collapses the freshly built dense array to 1-D without the extra copy
# that flatten() makes; the values are identical.
tfidf_train = vectorizer.fit_transform(train_reviews).toarray().ravel()
tfidf_test = vectorizer.transform(test_reviews).toarray().ravel()

# Two-sample Kolmogorov-Smirnov test on the pooled TF-IDF weights.
# NOTE(review): the pooled weights are presumably not independent draws, so
# treat the p-value as indicative rather than exact.
ks_stat, ks_pvalue = ks_2samp(tfidf_train, tfidf_test)
print(f"KS Statistic (train vs test): {ks_stat}, p-value: {ks_pvalue}")

# Wasserstein distance between the same two pooled samples
wd = wasserstein_distance(tfidf_train, tfidf_test)
print(f"Wasserstein Distance (train vs test): {wd}")

KS Statistic (train vs test): 3.619957063172308e-05, p-value: 1.0
Wasserstein Distance (train vs test): 3.272502854889912e-06


In [5]:
# Load both datasets here so this cell does not rely on variables that happen
# to be left in the kernel by previously executed cells.
test_df = pd.read_csv('../Data/test.csv')
new_df = pd.read_csv('../Data/new_data.csv')

# Column being compared: 'Review' (missing values are replaced by empty strings)
test_reviews = test_df['Review'].fillna("")
new_reviews = new_df['Review'].fillna("")

# TF-IDF vectorization: the vocabulary (capped at 5000 terms) is fitted on the
# test reviews and reused unchanged to transform the new reviews.
vectorizer = TfidfVectorizer(max_features=5000)
# ravel() collapses the freshly built dense array to 1-D without the extra copy
# that flatten() makes; the values are identical.
tfidf_test = vectorizer.fit_transform(test_reviews).toarray().ravel()
tfidf_new = vectorizer.transform(new_reviews).toarray().ravel()

# Two-sample Kolmogorov-Smirnov test on the pooled TF-IDF weights.
# NOTE(review): the pooled weights are presumably not independent draws, so
# treat the p-value as indicative rather than exact.
ks_stat, ks_pvalue = ks_2samp(tfidf_test, tfidf_new)
print(f"KS Statistic (test vs new_data): {ks_stat}, p-value: {ks_pvalue}")

# Wasserstein distance between the same two pooled samples
wd = wasserstein_distance(tfidf_test, tfidf_new)
print(f"Wasserstein Distance (test vs new_data): {wd}")

KS Statistic (test vs new_data): 0.011300400097584706, p-value: 1.817932328144194e-11
Wasserstein Distance (test vs new_data): 0.0010197223183568932


In [None]:
import pandas as pd
from scipy.stats import ks_2samp, wasserstein_distance
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# Read the training set and the newly collected data from disk
train_df = pd.read_csv('../Data/train.csv')
new_df = pd.read_csv('../Data/new_data.csv')

# Text column under comparison: 'Review'; missing entries become empty strings
train_reviews = train_df['Review'].fillna("")
new_reviews = new_df['Review'].fillna("")

# Fit the TF-IDF vocabulary (capped at 5000 terms) on the training reviews only,
# then project the new reviews onto that same vocabulary
vectorizer = TfidfVectorizer(max_features=5000)
train_matrix = vectorizer.fit_transform(train_reviews)
new_matrix = vectorizer.transform(new_reviews)

# Densify each matrix and collapse it into a single 1-D array of weights
tfidf_train = train_matrix.toarray().flatten()
tfidf_new = new_matrix.toarray().flatten()

# Two-sample Kolmogorov-Smirnov test on the pooled weights
ks_result = ks_2samp(tfidf_train, tfidf_new)
ks_stat = ks_result.statistic
ks_pvalue = ks_result.pvalue
print(f"KS Statistic (train vs new_data): {ks_stat}, p-value: {ks_pvalue}")

# Wasserstein distance between the same two pooled samples
wd = wasserstein_distance(tfidf_train, tfidf_new)
print(f"Wasserstein Distance (train vs new_data): {wd}")