In [None]:
import pandas as pd

In [None]:
# Read the books CSV (malformed rows are skipped rather than raising) and
# discard the bookID column in the same step, then preview the first rows.
df = (
    pd.read_csv("../data/books.csv", on_bad_lines="skip")
    .drop(columns='bookID')
)
df.head()

In [None]:
# Preview the trailing rows of the loaded frame.
df.tail()

In [None]:
# Print each column's dtype and non-null count.
df.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
df.describe()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Column labels exactly as read from the file (whitespace is stripped from them in the next cell).
df.columns

In [None]:
# Remove leading/trailing whitespace from every column label so columns can
# be addressed by their clean names.
df = df.rename(columns=str.strip)

In [None]:
# Confirm the column labels after whitespace stripping.
df.columns

In [None]:
# Peek at the raw publication_date values before they are parsed into datetimes.
df['publication_date'].head()

In [None]:
# Parse publication_date into datetimes; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
df = df.assign(
    publication_date=pd.to_datetime(df['publication_date'], errors='coerce')
)

In [None]:
# Re-check dtypes and non-null counts after the publication_date conversion.
df.info()

In [None]:
# Rows whose publication_date is missing after the datetime conversion.
df.loc[df['publication_date'].isna()]

In [None]:
# Derive the calendar year from the parsed publication_date as a new column.
df = df.assign(publication_year=df['publication_date'].dt.year)

In [None]:
# Preview the frame, now including the publication_year column.
df.head()

In [None]:
# Recount missing values per column now that dates have been coerced and
# publication_year has been added.
df.isna().sum()

In [None]:
# Number of rows for each distinct language_code value.
df["language_code"].value_counts()

In [None]:
# Drop every row that has a missing value in any column. The explicit .copy()
# makes the result an independent frame, so columns added in later cells are
# not written to something pandas may still treat as a slice of the original
# frame (which can surface as a SettingWithCopyWarning).
df = df.dropna().copy()

In [None]:
# Verify that no missing values remain after dropping incomplete rows.
df.isna().sum()

In [None]:
# (rows, columns) after dropping incomplete rows.
df.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Use seaborn's 'whitegrid' style for the plots that follow.
sns.set_style('whitegrid')

In [None]:
# Histogram of average_rating (30 bins) with a kernel-density curve overlaid.
# `kde` is a boolean switch in seaborn, so pass True rather than an arbitrary
# truthy integer. The explicit fig/ax interface keeps all labelling on the
# axes that histplot actually draws into.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['average_rating'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Average Book Ratings')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Report the central tendency of the ratings; the median is what the
# is_highly_rated label is thresholded on in the next cell.
ratings = df['average_rating']
print(f"Mean rating : {ratings.mean()}")
print(f"Median rating : {ratings.median()}")

In [None]:
# Binary target: 1 when a book's average_rating is at or above the median
# rating of the frame, 0 otherwise.
df['is_highly_rated'] = (
    df['average_rating']
    .ge(df['average_rating'].median())
    .astype(int)
)

In [None]:
# Spot-check the new label next to the rating it was derived from.
df.loc[:, ['title', 'average_rating', 'is_highly_rated']].head(20)

In [None]:
# Model inputs (four numeric columns plus the language code) and the binary
# label to predict.
target = 'is_highly_rated'
features = [
    'num_pages',
    'ratings_count',
    'text_reviews_count',
    'publication_year',
    'language_code',
]

X, y = df[features], df[target]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Columns are routed by type: numeric columns are standardised, the
# categorical column is one-hot encoded.
numerical_features = ['num_pages', 'ratings_count', 'text_reviews_count', 'publication_year']
categorical_features = ['language_code']

numeric_step = ('num', StandardScaler(), numerical_features)
# handle_unknown='ignore': categories not seen during fit do not raise at
# transform time.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit the transformers on the training split only, then apply the already
# fitted transformers to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [None]:
# Sanity-check the dimensions of the transformed train and test matrices.
print("Shape of processed training data:", X_train_processed.shape)
print("Shape of processed test data:", X_test_processed.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (fixed random_state, otherwise default
# settings) on the preprocessed training data. fit() returns the estimator
# itself, so construction and fitting can be chained.
model = LogisticRegression(random_state=42).fit(X_train_processed, y_train)

print("Model training complete!")

In [None]:
# Predict labels for the preprocessed test split.
y_pred = model.predict(X_test_processed)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Fraction of test rows whose predicted label matches the true label,
# printed to four decimal places.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
# Per-class precision, recall, F1 and support on the test split.
print(classification_report(y_test, y_pred))