In [1]:
!pip install textstat seaborn



In [None]:
# !pip install --upgrade numpy scipy nltk scikit-learn textstat torch pandas

In [None]:
import textstat
import nltk
from nltk.corpus import wordnet
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def compute_metrics(text):
    """Compute readability and surface-complexity metrics for a single text.

    Args:
        text: The string to score. Word-level metrics use whitespace
            tokenisation (``text.split()``), so punctuation stays attached
            to the neighbouring word and comparison is case-sensitive.

    Returns:
        dict mapping metric name to value, with keys in this order:
        ``flesch_reading_ease``, ``gunning_fog``, ``smog_index``,
        ``automated_readability_index``, ``lexical_diversity`` (unique
        tokens / total tokens), ``syllable_count``, ``complex_word_count``
        (``textstat.difficult_words``), ``avg_word_length`` (characters per
        token) and ``sentence_length``. Note that ``sentence_length`` is the
        number of whitespace-separated tokens in ``text``, not a count of
        sentences. The two ratio metrics are ``0`` when the text has no tokens.
    """
    # Tokenise once; the word-based metrics below all share this split
    # instead of re-splitting the text for every expression.
    words = text.split()
    word_count = len(words)

    return {
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "gunning_fog": textstat.gunning_fog(text),
        "smog_index": textstat.smog_index(text),
        "automated_readability_index": textstat.automated_readability_index(text),
        # Guard the ratios so an empty / whitespace-only text yields 0
        # rather than raising ZeroDivisionError.
        "lexical_diversity": len(set(words)) / word_count if word_count > 0 else 0,
        "syllable_count": textstat.syllable_count(text),
        "complex_word_count": textstat.difficult_words(text),
        "avg_word_length": sum(len(word) for word in words) / word_count if word_count > 0 else 0,
        "sentence_length": word_count,
    }

In [None]:
# Smoke-test compute_metrics on a single hand-written sentence.
# NOTE: `metrics` holds a dict here; a later cell rebinds the same name to a
# list of metric column names.
sentence = "This is an example sentence to evaluate."
metrics = compute_metrics(sentence)
print(metrics)

In [None]:
# Worked example of the average-word-length formula: total characters across
# the whitespace-separated tokens divided by the number of tokens.
sentence = "This is an example."
words = sentence.split()
total_chars = sum(map(len, words))
avg_word_length = total_chars / len(words)
print(f"Average word length: {avg_word_length}")

In [None]:
# Load the results table. The path is relative to the notebook's working
# directory. Later cells read the columns 'input_text', '1b', '3b' and '8b'.
df = pd.read_csv('data_logs/wmt14_bleu_threshold.csv')

In [None]:
# Score every input text and expand the resulting dicts into one column per
# metric. Building the frame from the list of dicts in one go avoids creating
# a pandas Series for every row (which `.apply(pd.Series)` does); integer
# metrics therefore keep an integer dtype instead of being upcast to float.
df_metrics = pd.DataFrame(
    df['input_text'].map(compute_metrics).tolist(),
    index=df.index,
)

# Drop metric columns left over from an earlier run of this cell before
# concatenating, so re-running the cell does not produce duplicate columns.
df = pd.concat(
    [df.drop(columns=df_metrics.columns, errors='ignore'), df_metrics],
    axis=1,
)

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Names of the columns produced by compute_metrics, in the order they are
# plotted and correlated below.
metrics = [
    "flesch_reading_ease",
    "gunning_fog",
    "smog_index",
    "automated_readability_index",
    "lexical_diversity",
    "syllable_count",
    "complex_word_count",
    "avg_word_length",
    "sentence_length",
]

In [None]:
# Partition the rows on the '1b' label column: rows equal to 1 versus rows
# equal to 0. (Presumably 1 means the 1b model handled the input -- confirm
# against whatever produced the CSV.)
label_1b = df['1b']
can_handle_df = df[label_1b.eq(1)]
cannot_handle_df = df[label_1b.eq(0)]

In [None]:
# One figure per metric, overlaying the histogram (with KDE curve) of the
# '1b' == 1 rows in blue and the '1b' == 0 rows in red.
plot_groups = (
    (can_handle_df, 'blue', 'Can Handle'),
    (cannot_handle_df, 'red', 'Cannot Handle'),
)

for metric in metrics:
    plt.figure(figsize=(8, 6))
    for group_df, group_color, group_label in plot_groups:
        sns.histplot(
            group_df[metric],
            color=group_color,
            alpha=0.5,
            bins=30,
            kde=True,
            label=group_label,
        )
    plt.title(f"Distribution of {metric}")
    plt.xlabel(metric)
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()

In [None]:
# Correlation (pandas' default for Series.corr) between each metric column
# and the '1b' label column, keyed by metric name.
target_1b = df['1b']
correlation_results = {
    metric_name: df[metric_name].corr(target_1b) for metric_name in metrics
}
print("Correlations with '1b':", correlation_results)

In [38]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

In [39]:
# Columns that must not be fed to the model: the raw text and the three
# label columns that serve as targets.
exclude_cols = ['input_text', '1b', '3b', '8b']

# Every remaining column becomes a feature, in the frame's column order.
# NOTE(review): this assumes all other columns in the CSV are numeric and are
# legitimate model inputs -- confirm against the file's schema.
feature_cols = list(filter(lambda name: name not in exclude_cols, df.columns))

# Feature matrix as a NumPy array; targets stay a DataFrame (one column per label).
X = df[feature_cols].to_numpy()
y = df.loc[:, ['1b', '3b', '8b']]

In [40]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [41]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
def _as_float32_tensor(array):
    """Copy an array-like into a new float32 torch tensor."""
    return torch.tensor(array, dtype=torch.float32)


# Features are already NumPy arrays; targets are DataFrames and are converted
# with .to_numpy() first. Targets are float32 as well because the loss used
# later (BCEWithLogitsLoss) is computed against floating-point targets.
X_train_tensor = _as_float32_tensor(X_train)
X_val_tensor = _as_float32_tensor(X_val)
y_train_tensor = _as_float32_tensor(y_train.to_numpy())
y_val_tensor = _as_float32_tensor(y_val.to_numpy())

# Pair each feature row with its target row.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

In [43]:
# Mini-batch iterators. Only the training loader shuffles; the validation
# loader keeps the dataset order.
batch_size = 8
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [44]:
class SimpleNet(nn.Module):
    """Fully connected feed-forward network that returns raw (un-activated) outputs.

    With the default arguments the architecture is
    ``input_dim -> 128 -> ReLU -> 64 -> ReLU -> 3``, i.e. exactly the layers
    that were previously hard-coded, with identical ``state_dict`` keys
    (``fc.0``, ``fc.2``, ``fc.4``).

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Widths of the hidden ``Linear`` layers, in order. Each
            hidden layer is followed by a ``ReLU``. Defaults to ``(128, 64)``.
        output_dim: Number of values produced per sample. Defaults to ``3``.
    """

    def __init__(self, input_dim, hidden_dims=(128, 64), output_dim=3):
        super().__init__()

        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        # Final projection has no activation: the network emits logits.
        layers.append(nn.Linear(in_features, output_dim))

        # Keep the attribute name `fc` and the layer ordering so checkpoints
        # saved from the hard-coded version still load.
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for ``x``; the last dimension has size ``output_dim``."""
        return self.fc(x)

In [45]:
# Seed torch's global RNG before the model is built so that the weight
# initialisation -- and the batch shuffling that later draws from the same
# RNG -- is repeatable on a fresh Restart & Run All. 42 matches the
# random_state already used for the train/validation split.
torch.manual_seed(42)

# Prefer the GPU when one is available; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleNet(input_dim=len(feature_cols)).to(device)

In [46]:
# BCEWithLogitsLoss takes raw logits (it applies the sigmoid itself, per the
# PyTorch docs), which matches SimpleNet ending in a plain Linear layer.
BCE_loss = nn.BCEWithLogitsLoss()
# Plain SGD over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [47]:
# Number of full passes over train_dataloader performed by the training loop.
num_epochs = 100

In [48]:
# Train for num_epochs epochs. Each epoch runs one optimisation pass over the
# training batches followed by a gradient-free pass over the validation
# batches, then prints the mean per-batch loss of both.
for epoch in range(num_epochs):
    # ---- optimisation pass ----
    model.train()
    train_batch_losses = []
    for X_batch, y_batch in train_dataloader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = BCE_loss(outputs, y_batch)
        loss.backward()
        optimizer.step()

        train_batch_losses.append(loss.item())

    # Mean of the per-batch losses: every batch counts equally, whatever its size.
    train_loss = sum(train_batch_losses) / len(train_dataloader)

    # ---- validation pass (no gradients, no parameter updates) ----
    model.eval()
    val_batch_losses = []
    with torch.no_grad():
        for X_batch, y_batch in val_dataloader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            outputs = model(X_batch)
            loss = BCE_loss(outputs, y_batch)
            val_batch_losses.append(loss.item())

    val_loss = sum(val_batch_losses) / len(val_dataloader)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

Epoch 1/100, Train Loss: 0.6813, Val Loss: 0.6640
Epoch 2/100, Train Loss: 0.6465, Val Loss: 0.6324
Epoch 3/100, Train Loss: 0.6167, Val Loss: 0.6049
Epoch 4/100, Train Loss: 0.5907, Val Loss: 0.5809
Epoch 5/100, Train Loss: 0.5682, Val Loss: 0.5604
Epoch 6/100, Train Loss: 0.5492, Val Loss: 0.5433
Epoch 7/100, Train Loss: 0.5336, Val Loss: 0.5293
Epoch 8/100, Train Loss: 0.5209, Val Loss: 0.5179
Epoch 9/100, Train Loss: 0.5107, Val Loss: 0.5088
Epoch 10/100, Train Loss: 0.5025, Val Loss: 0.5014
Epoch 11/100, Train Loss: 0.4958, Val Loss: 0.4954
Epoch 12/100, Train Loss: 0.4904, Val Loss: 0.4904
Epoch 13/100, Train Loss: 0.4859, Val Loss: 0.4863
Epoch 14/100, Train Loss: 0.4821, Val Loss: 0.4828
Epoch 15/100, Train Loss: 0.4790, Val Loss: 0.4799
Epoch 16/100, Train Loss: 0.4762, Val Loss: 0.4774
Epoch 17/100, Train Loss: 0.4739, Val Loss: 0.4752
Epoch 18/100, Train Loss: 0.4719, Val Loss: 0.4733
Epoch 19/100, Train Loss: 0.4701, Val Loss: 0.4717
Epoch 20/100, Train Loss: 0.4685, Val Lo