In [None]:
import requests
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [None]:
# Endpoint that serves the training dataset as JSON.
URL = "http://localhost:8080/api/ml/dataset"

# requests applies no timeout by default, so without this the cell could block
# forever if the service accepts the connection but never answers.
REQUEST_TIMEOUT_SECONDS = 30

response = requests.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
# Fail loudly on 4xx/5xx instead of trying to parse an error payload.
response.raise_for_status()

data = response.json()
df = pd.DataFrame(data)

print("Dataset shape:", df.shape)

Cleaning the data:
   - text_columns --> contains the names of all the columns that hold text
   - fillna("") --> replaces missing values with an empty string instead of leaving NaN in the cell
   - astype(str) --> makes sure every value is treated as a string
   - df["liked"].astype(int) --> the liked column marks whether the user likes an artwork or not 
      - (1 liked, 0 not liked)
      
combining the columns:
   - puts all the descriptive features into a single text field instead of multiple columns, which makes it easier to associate each set of features with a specific artwork.

In [None]:

# Columns that are treated as free text when building the model input.
text_columns = [
    "artist",
    "period",
    "culture",
    "medium",
    "preferredArtists",
    "preferredStyles",
    "preferredMediums",
    "preferredTimePeriods",
    "preferredMovements",
]

# Normalise each text column: missing values become "" and every value is
# cast to str so the columns can be concatenated below.
for column_name in text_columns:
    filled = df[column_name].fillna("")
    df[column_name] = filled.astype(str)

# Target label as an integer.
df["liked"] = df["liked"].astype(int)

# Combined Text Feature: join the text columns, in list order, separated by
# single spaces.
first_column, *remaining_columns = text_columns
combined_text = df[first_column]
for column_name in remaining_columns:
    combined_text = combined_text + " " + df[column_name]
df["combined_text"] = combined_text

# Model input (raw text) and target.
X_text, y = df["combined_text"], df["liked"]

splitting data 
    - training data 80%
    - testing data 20% 

In [None]:
# 80/20 split with a fixed seed so the partition is reproducible;
# stratifying on y keeps the label proportions the same in both parts.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
split_parts = train_test_split(X_text, y, **split_options)
X_train, X_test, y_train, y_test = split_parts


TF-IDF Vectorization - turning words into numbers 

In [None]:
# Unigrams and bigrams, capped at 500 features.
tfidf_settings = {
    "max_features": 500,
    "ngram_range": (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vocabulary and IDF weights are learned from the training text only;
# the test text is transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

Training Classifier using logistic regression:
    - checks the features and finds patterns

In [None]:
# Fit the classifier on the TF-IDF training matrix (fit() returns the estimator).
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Evaluate on the held-out test matrix.
y_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("\nAccuracy:", test_accuracy)
print("\nClassification Report:")
print(report_text)

- artwork_recommender :
    - trained model
- vectorizer: 
    - word to number translator 

In [None]:
# Save the fitted classifier to the current working directory.
joblib.dump(model, "artwork_recommender.pkl")
# Save the fitted vectorizer as well: the model was trained on this
# vectorizer's output, so new text has to go through the same transform
# before it can be passed to model.predict.
joblib.dump(vectorizer, "vectorizer.pkl")