In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the joke table and the ratings table from the Jester dataset.
jokes_df = pd.read_csv('/kaggle/input/jester-17m-jokes-ratings-dataset/jester_items.csv')
ratings_df = pd.read_csv('/kaggle/input/jester-17m-jokes-ratings-dataset/jester_ratings.csv')
# Distinct joke ids present in the joke table.
unique_categories = jokes_df['jokeId'].unique()

# Sizes first: distinct joke ids, joke rows, rating rows.
for row_count in map(len, (unique_categories, jokes_df, ratings_df)):
    print(row_count)

# Then a short preview of each table.
for table in (jokes_df, ratings_df):
    print(table.head())

In [None]:
# Count ratings per user, then take the user id with the largest count.
ratings_per_user = ratings_df['userId'].value_counts()
most_common_user_id = ratings_per_user.idxmax()
print(most_common_user_id)

In [None]:
# Keep only the ratings submitted by the most active user.
ratings_df = ratings_df.loc[ratings_df['userId'] == most_common_user_id]

# Attach the joke text to each of that user's ratings; the inner join drops
# jokes this user never rated.
dataset_df = pd.merge(jokes_df, ratings_df, on='jokeId', how='inner')
unique = dataset_df['jokeId'].unique()
print(len(unique))
print(dataset_df.head())
print(len(dataset_df))

# Work on an independent copy: later cells rescale and round the 'rating'
# column of avg_rating_df in place, and a plain assignment would make
# dataset_df the very same object and silently rewrite it as well.
avg_rating_df = dataset_df.copy()

In [None]:
# Mean rating per joke, computed from the merged frame built in the previous cell.
# (The former `dataset_df = data` line referenced a name that is never defined
# in this notebook and would also have discarded the merged frame.)
average_ratings = dataset_df.groupby('jokeId')['rating'].mean()
print(len(average_ratings))
average_ratings.head()

In [None]:
# Preview the working frame. (An earlier draft rebuilt it from `average_ratings`
# as jokeId / avg_rating columns; that step is no longer used.)
avg_rating_df.head(n=5)

In [None]:
import matplotlib.pyplot as plt
# Distribution of the 'rating' column, split into 10 equal-width bins.
fig, ax = plt.subplots()
ax.hist(avg_rating_df['rating'], bins=10, edgecolor='black')
ax.set_title('Histogram of Average Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')

plt.show()

In [None]:
# Record the observed range of the 'rating' column before it is rescaled.
rating_values = avg_rating_df['rating']
original_min, original_max = rating_values.min(), rating_values.max()

print(original_min, original_max)

In [None]:
target_min = 0
target_max = 10

# Measure the bounds from the column as it is right now instead of reusing
# values captured by an earlier cell. On a top-to-bottom run the result is the
# same, but re-running this cell on its own now leaves the ratings unchanged
# (up to float rounding) instead of rescaling them a second time.
current_min = avg_rating_df['rating'].min()
current_max = avg_rating_df['rating'].max()
rating_span = current_max - current_min
if rating_span == 0:
    # All ratings are identical, so the mapping below would be 0/0 -> NaN.
    raise ValueError('Cannot rescale ratings: all values are identical.')

# Apply the linear mapping formula
avg_rating_df['rating'] = ((avg_rating_df['rating'] - current_min) / rating_span) * (target_max - target_min) + target_min


In [None]:
import matplotlib.pyplot as plt
# Distribution of the 'rating' column, split into 10 equal-width bins.
fig, ax = plt.subplots()
ax.hist(avg_rating_df['rating'], bins=10, edgecolor='black')
ax.set_title('Histogram of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')

plt.show()

In [None]:
# Preview the working frame. (An earlier draft merged it with jokes_df on
# 'jokeId' at this point; that step is no longer used.)
avg_rating_df.head(n=5)



In [None]:
# Round each rating to a whole number, then store the column as integers.
rounded_ratings = avg_rating_df['rating'].round()
avg_rating_df['rating'] = rounded_ratings.astype(int)

In [None]:
# Preview the frame after the ratings were rounded to integers.
avg_rating_df.head(n=5)

In [None]:
import matplotlib.pyplot as plt
# On a top-to-bottom run the ratings are integers from 0 to 10 here, i.e. 11
# distinct values. With bins=10 the last bin spans [9, 10] and lumps ratings 9
# and 10 together, so give every integer its own unit-wide bin centred on it.
rating_bin_edges = np.arange(12) - 0.5
plt.hist(avg_rating_df['rating'], bins=rating_bin_edges, edgecolor='black')
plt.xticks(range(11))

plt.title('Histogram of Average Ratings')
plt.xlabel('Average Rating')
plt.ylabel('Frequency')

plt.show()

In [None]:

# Select the jokes whose (rounded) rating is 10. The column is named 'rating';
# no 'avg_rating' column is ever created in this notebook.
high_avg_rating_jokes = avg_rating_df[avg_rating_df['rating'] == 10]

# Calculate the number of jokes to delete (35% of the jokes rated 10)
num_jokes_to_delete = int(len(high_avg_rating_jokes) * 0.35)

# Randomly select jokes to delete (fixed seed so the selection is repeatable)
jokes_to_delete = high_avg_rating_jokes.sample(n=num_jokes_to_delete, random_state=42)

# Remove the selected jokes from the DataFrame.
# NOTE: this rebinds avg_rating_df, so re-running the cell removes a further
# 35% of the remaining top-rated jokes.
avg_rating_df = avg_rating_df.drop(jokes_to_delete.index)


In [None]:
import matplotlib.pyplot as plt
# Plot the 'rating' column (there is no 'avg_rating' column in this frame).
# On a top-to-bottom run the ratings are integers from 0 to 10, so use one
# unit-wide bin per integer; bins=10 would merge ratings 9 and 10 into one bar.
rating_bin_edges = np.arange(12) - 0.5
plt.hist(avg_rating_df['rating'], bins=rating_bin_edges, edgecolor='black')
plt.xticks(range(11))

plt.title('Histogram of Average Ratings')
plt.xlabel('Average Rating')
plt.ylabel('Frequency')

plt.show()

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

In [None]:
# 80/20 split of the rows into training and validation frames, with a fixed seed.
split_frames = train_test_split(avg_rating_df, test_size=0.2, random_state=42)
train_data, val_data = split_frames

In [None]:
# Load the pretrained BERT tokenizer and a sequence-classification model
# built from the same checkpoint.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
# 11 output classes: one per integer rating from 0 to 10.
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=11)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)



In [None]:
from torch.nn.utils.rnn import pad_sequence  # Import pad_sequence

def tokenize_and_pad(texts, max_length=512):
    """Encode each text with the global ``tokenizer`` and right-pad into one tensor.

    Args:
        texts: Iterable of strings to encode.
        max_length: Upper bound on the number of token ids per text, special
            tokens included. Longer texts are truncated so they cannot exceed
            the model's position limit (512 for the ``bert-base-uncased``
            checkpoint this notebook loads).

    Returns:
        A 2-D integer tensor of token ids with one row per text, padded on the
        right with the tokenizer's pad id up to the longest row in ``texts``.
    """
    encoded_rows = [
        torch.tensor(
            tokenizer.encode(
                text,
                add_special_tokens=True,
                truncation=True,
                max_length=max_length,
            )
        )
        for text in texts
    ]
    # Pad with the tokenizer's own pad id so padding can be told apart from
    # real tokens later on.
    return pad_sequence(encoded_rows, batch_first=True, padding_value=tokenizer.pad_token_id)




In [None]:
# Token-id matrix for the training jokes (one padded row per joke).
train_inputs = tokenize_and_pad(train_data['jokeText'])
# Go through NumPy: after train_test_split the Series still carries its original,
# shuffled index labels, and torch.tensor() on a Series looks elements up by
# label (0, 1, 2, ...), which can raise KeyError or return the labels in the
# wrong order. to_numpy() yields the values in row order.
train_labels = torch.tensor(train_data['rating'].to_numpy(), dtype=torch.long)
train_dataset = TensorDataset(train_inputs, train_labels)
# Mini-batches of 16, reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [None]:
# Define optimizer and loss function.
# Only the classification head's parameters are handed to the optimizer, so the
# optimizer never updates the BERT encoder weights. torch.optim.AdamW is used
# because the AdamW class exported by transformers is deprecated in favour of it.
optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=1e-5, weight_decay=0.01)
criterion = torch.nn.CrossEntropyLoss()


In [None]:
# Number of full passes over the training dataloader made by the training loop.
num_epochs = 50


In [None]:
# Training loop: one optimizer step per mini-batch, mean batch loss reported per epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        optimizer.zero_grad()
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)  # Move inputs and labels to GPU
        # Batches are right-padded with the pad id. Without an explicit mask the
        # model attends to those padding positions as if they were real tokens,
        # so mark them with 0 and the real tokens with 1.
        attention_mask = (inputs != tokenizer.pad_token_id).long()
        # Index 0 of the model output holds the classification logits.
        outputs = model(inputs, attention_mask=attention_mask)[0]
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_dataloader)}')


In [None]:
# Preview the held-out validation rows.
val_data.head(n=5)

In [None]:
def get_joke_class(joke_text, max_length=512):
    """Predict the rating class of a single joke with the global ``model``.

    Args:
        joke_text: The joke as a plain string.
        max_length: Upper bound on the number of token ids fed to the model,
            special tokens included. Longer jokes are truncated so they cannot
            exceed the model's position limit (512 for the
            ``bert-base-uncased`` checkpoint this notebook loads).

    Returns:
        The index of the highest-scoring class as a Python int.
    """
    token_ids = tokenizer.encode(
        joke_text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
    )
    # A single sequence needs no padding; wrapping it in a list adds the batch axis.
    input_ids = torch.tensor([token_ids]).to(device)

    # Switch to evaluation mode and skip gradient tracking for inference.
    model.eval()
    with torch.no_grad():
        logits = model(input_ids)[0]

    # Take the arg-max over the class axis explicitly instead of over the
    # flattened tensor.
    return torch.argmax(logits, dim=-1).item()

# Example usage: classify one hard-coded sentence and print the predicted class index.
joke = " He'll stop at nothing to avoid them."
predicted_class = get_joke_class(joke)
print(f'Predicted Class: {predicted_class}')

In [None]:
from sklearn.metrics import f1_score

# Ground-truth labels and model predictions for the validation rows, in row order.
true_classes = val_data['rating'].tolist()
predicted_classes = list(map(get_joke_class, val_data['jokeText']))
# Macro averaging is requested via average='macro'.
f1 = f1_score(y_true=true_classes, y_pred=predicted_classes, average='macro')

In [None]:
# Display the macro F1 score computed in the previous cell.
f1

In [None]:
# Save the trained model into a directory relative to the current working directory.
output_dir = 'trained_classification_model'
model.save_pretrained(output_dir)