In [None]:
!pip install "bokeh>=3.1.0" dask dask[complete] dask-ml scikit-learn nltk lightgbm pyngrok --quiet

In [None]:
import os
import dask.dataframe as dd
import dask
from dask_ml.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import pandas as pd
from dask_ml.metrics import accuracy_score
import numpy as np
from pyngrok import ngrok
import time
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from lightgbm.dask import DaskLGBMClassifier
from dask.distributed import Client, LocalCluster

# Download the NLTK resources this notebook relies on: the stop-word corpus
# and the "punkt" / "punkt_tab" tokenizer resources.
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

# English stop words held in a set so membership checks during token
# filtering are constant-time.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
def download_and_extract(dataset, file_name):
    """Download one file of a Kaggle dataset and unzip it into ``./downloads``.

    The work is skipped when the extracted file is already present, so the
    calling cell can be re-run without downloading again.

    Args:
        dataset: Kaggle dataset identifier passed to ``kaggle datasets download``.
        file_name: Name of the file inside the dataset; the downloaded archive
            is expected to be ``<file_name>.zip`` in the current directory.

    Returns:
        Path of the extracted file, ``./downloads/<file_name>``.

    Raises:
        subprocess.CalledProcessError: If ``kaggle`` or ``unzip`` exits with a
            non-zero status (previously such failures were silently ignored).
        FileNotFoundError: If one of the two command-line tools is not installed.
    """
    # Function-scope import: the notebook's import cell does not bring in subprocess.
    import subprocess

    download_path = "./downloads"
    zip_file_path = file_name + ".zip"
    extracted_file_path = os.path.join(download_path, file_name)

    # Already extracted by an earlier run: nothing to do.
    if os.path.exists(extracted_file_path):
        return extracted_file_path

    # Argument lists (no shell) keep dataset/file names from being interpreted
    # by a shell; check=True surfaces a failed download or extraction at once
    # instead of returning a path to a file that does not exist.
    subprocess.run(
        ["kaggle", "datasets", "download", dataset, "--file", file_name],
        check=True,
    )
    # -o: overwrite without prompting, so a non-interactive run can never block.
    subprocess.run(["unzip", "-o", zip_file_path, "-d", download_path], check=True)

    return extracted_file_path

def clean_and_label(df_chunk):
    """Normalise review text and derive a binary sentiment label.

    The input frame is left untouched; a new frame is returned. This matters
    when the function is used with ``map_partitions``, where a partition may
    be handed to the function more than once.

    Args:
        df_chunk: pandas DataFrame with ``review_body`` and ``star_rating``
            columns.

    Returns:
        A copy of ``df_chunk`` where
        * ``review_body`` is lower-cased, stripped of URLs and of every
          character that is not an ASCII letter or whitespace, with runs of
          whitespace collapsed to one space and the ends trimmed;
        * ``star_rating`` is numeric (unparseable values become NaN);
        * ``sentiment`` is 1 when ``star_rating > 3`` and 0 otherwise. A NaN
          rating compares false and is therefore labelled 0.
    """
    cleaned_body = (
        df_chunk["review_body"]
        .fillna("")
        .str.lower()
        .str.replace(r"http\S+|www\S+|https\S+", "", regex=True)
        .str.replace(r"[^a-zA-Z\s]", "", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    ratings = pd.to_numeric(df_chunk["star_rating"], errors="coerce")
    # Vectorised comparison instead of a per-row Python lambda; NaN > 3 is False.
    sentiment = (ratings > 3).astype("int64")

    return df_chunk.assign(
        review_body=cleaned_body,
        star_rating=ratings,
        sentiment=sentiment,
    )

def tokenize_and_filter(df_chunk):
    """Tokenise ``review_body`` and keep only informative words.

    A token is kept when it is purely alphabetic, longer than two characters
    and not in the module-level ``stop_words`` set. The input frame is not
    modified; a new frame with an added ``tokens`` column is returned.

    Args:
        df_chunk: pandas DataFrame with a ``review_body`` text column.

    Returns:
        A copy of ``df_chunk`` with a ``tokens`` column holding one list of
        kept tokens per row (missing text yields an empty list).
    """
    def _content_tokens(text):
        # One pass over the tokens of a single review.
        return [
            token for token in word_tokenize(text)
            if token.isalpha() and len(token) > 2 and token not in stop_words
        ]

    bodies = df_chunk["review_body"].fillna("")
    return df_chunk.assign(tokens=bodies.apply(_content_tokens))

def apply_tfidf(df_chunk, num_features, vectorizer=None):
    """Turn the ``tokens`` column of a chunk into a dense TF-IDF feature frame.

    The result always has exactly ``num_features`` feature columns
    (``feature_0`` ... ``feature_{num_features-1}``) followed by
    ``sentiment``, so every chunk has the same layout even when its
    vocabulary is smaller than ``num_features`` or the chunk is empty;
    missing columns are filled with zeros.

    Args:
        df_chunk: pandas DataFrame with ``tokens`` (list of str per row) and
            ``sentiment`` columns. It is not modified.
        num_features: Number of feature columns to produce; also used as
            ``max_features`` when a vectorizer is fitted here.
        vectorizer: Optional already-fitted vectorizer exposing
            ``transform``. When given, it is applied as-is, which lets all
            chunks share one vocabulary. When omitted (the default), a new
            ``TfidfVectorizer`` is fitted on this chunk alone, so
            ``feature_i`` refers to this chunk's own vocabulary.

    Returns:
        pandas DataFrame with a fresh default index, ``num_features`` float
        columns and the ``sentiment`` column copied from ``df_chunk``.

    Raises:
        ValueError: If a supplied ``vectorizer`` yields more than
            ``num_features`` columns, or if fitting fails for a reason other
            than an empty vocabulary.
    """
    documents = df_chunk["tokens"].apply(" ".join)

    if vectorizer is None:
        try:
            tfidf_matrix = TfidfVectorizer(max_features=num_features).fit_transform(documents)
        except ValueError as exc:
            # scikit-learn signals "no terms at all" (empty chunk, or only
            # empty documents) with this message; treat it as all-zero features.
            if "empty vocabulary" not in str(exc):
                raise
            tfidf_matrix = None
    else:
        tfidf_matrix = vectorizer.transform(documents)

    # Start from zeros so columns beyond the available vocabulary are padded.
    features = np.zeros((len(documents), num_features), dtype="float64")
    if tfidf_matrix is not None:
        dense = tfidf_matrix.toarray()
        if dense.shape[1] > num_features:
            raise ValueError(
                f"vectorizer produced {dense.shape[1]} features, "
                f"more than num_features={num_features}"
            )
        features[:, :dense.shape[1]] = dense

    feature_df = pd.DataFrame(
        features, columns=[f"feature_{i}" for i in range(num_features)]
    )
    feature_df["sentiment"] = df_chunk["sentiment"].values

    return feature_df

In [None]:
# Optional, left disabled: register an ngrok auth token (placeholder value)
# before opening a tunnel.
#!ngrok authtoken xxxxxxxxxxxx

# Start a Dask distributed client with 4 workers and 2 threads per worker,
# then print its summary (scheduler address, process/thread counts, memory).
# NOTE(review): re-running this cell creates another client without closing
# the previous one.
client = Client(n_workers=4, threads_per_worker=2)
print(client)


# Optional, left disabled: tunnel port 8787 through ngrok and print the public
# URL. 8787 is the dashboard port reported in the scheduler log below.
#public_url = ngrok.connect(8787)
#print(f"Dask Dashboard: {public_url}")

INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
INFO:distributed.scheduler:State start
INFO:distributed.diskutils:Found stale lock file and directory '/tmp/dask-scratch-space/scheduler-dwmqpkgy', purging
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:39503
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:8787/status
INFO:distributed.scheduler:Registering Worker plugin shuffle
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:43547'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:45849'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:34283'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:46647'
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:46401 name: 0
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:46401
INFO:distributed.core:Starting established con

<Client: 'tcp://127.0.0.1:39503' processes=4 threads=8, memory=334.56 GiB>


In [None]:
# Kaggle dataset identifier and the single TSV file taken from it.
dataset = "cynthiarempel/amazon-us-customer-reviews-dataset"
file_name = "amazon_reviews_us_Wireless_v1_00.tsv"
file_path = download_and_extract(dataset, file_name)

# Parser settings: tab-separated input, ratings read as float32, malformed
# rows skipped, parsed with pandas' Python engine.
read_options = dict(
    sep="\t",
    dtype={"star_rating": "float32"},
    on_bad_lines="skip",
    engine="python",
)

# Lazy Dask DataFrame over the extracted file.
ddf = dd.read_csv(file_path, **read_options)

In [None]:
# Keep a random 50% of the rows, with a fixed seed for reproducibility.
# NOTE: this rebinds `ddf`, so re-running the cell samples the already-sampled
# frame again; re-run the loading cell first to start from the full data.
ddf = ddf.sample(frac=0.5, random_state=42)

In [None]:
# Number of rows currently in `ddf`. On a Dask DataFrame this is not free:
# the row count has to be computed from the data.
len(ddf)

898795

In [None]:
# Regroup the data into 4 partitions; every map_partitions step that follows
# therefore runs once per each of these 4 chunks.
# NOTE(review): 4 presumably mirrors n_workers=4 of the client — confirm.
ddf = ddf.repartition(npartitions=4)

In [None]:
# Width of the TF-IDF feature block; used both for the vectoriser and for the
# declared output schema so the two cannot drift apart.
NUM_FEATURES = 300

# Typed schema of apply_tfidf's output. An empty DataFrame built from column
# names alone would declare every column as object dtype, which does not
# describe the float features and integer label that are actually produced.
tfidf_meta = {f"feature_{i}": "float64" for i in range(NUM_FEATURES)}
tfidf_meta["sentiment"] = "int64"

start_time = time.time()
ddf_clean = ddf.map_partitions(clean_and_label)
ddf_tokenized = ddf_clean.map_partitions(tokenize_and_filter)
tfidf_dask = ddf_tokenized.map_partitions(
    apply_tfidf,
    NUM_FEATURES,
    meta=tfidf_meta,
)
# Run the pipeline once and keep the result on the cluster. Without this, the
# compute() below and any later persist() of `tfidf_dask` would each execute
# cleaning, tokenising and vectorising from scratch.
tfidf_dask = tfidf_dask.persist()
# Local pandas copy of the features, gathered from the persisted partitions.
final_df = tfidf_dask.compute()
end_time = time.time()
print(f"Total time taken: {end_time - start_time} seconds")


Total time taken: 95.95686388015747 seconds


In [None]:
# Keep the feature frame materialised on the cluster before splitting it.
tfidf_dask = tfidf_dask.persist()

# Shuffled 80/20 split with a fixed seed.
splits = train_test_split(tfidf_dask, test_size=0.2, random_state=42, shuffle=True)

# Materialise both halves as well, so later steps reuse them instead of
# re-running the split.
train_ddf, test_ddf = (part.persist() for part in splits)

In [None]:
target_column = "sentiment"
# Every column except the label is a model input.
feature_columns = [col for col in tfidf_dask.columns if col != target_column]


def _features_and_target(frame):
    """Return (features, target) of ``frame`` as Dask arrays with known chunk lengths."""
    features = frame[feature_columns].to_dask_array(lengths=True)
    target = frame[target_column].to_dask_array(lengths=True)
    return features, target


X_train, y_train = _features_and_target(train_ddf)
X_test, y_test = _features_and_target(test_ddf)

In [None]:
# LightGBM settings: binary objective evaluated with AUC, gradient-boosted
# trees with 31 leaves, learning rate 0.1 and a fixed seed.
lgb_params = dict(
    objective="binary",
    metric=["auc"],
    boosting_type="gbdt",
    num_leaves=31,
    learning_rate=0.1,
    random_state=42,
)

start_time = time.time()
dask_model = DaskLGBMClassifier(**lgb_params)
# The held-out split is passed as the evaluation set during fitting.
evaluation_sets = [(X_test, y_test)]
dask_model.fit(
    X_train,
    y_train,
    eval_set=evaluation_sets,
    feature_name=feature_columns,
)
end_time = time.time()
print(f"Total time taken: {end_time - start_time} seconds")



Finding random open ports for workers
Total time taken: 15.492602825164795 seconds


In [None]:
feature_names = dask_model.feature_name_
# Print the count and a short preview instead of every name; the full list is
# still available in `feature_names`.
print(f"{len(feature_names)} features, first 10: {feature_names[:10]}")

['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56', 'feature_57', 'feature_58', 'feature_59', 'feature_60', 'feature_61', 'feature_62', 'feature_63', 'feature_64', 'feature_65', 'feature_66', 'feature_67', 'feature_68', 'feature_69', 'feature_70', 'feature_71', '

In [None]:
# Predicted probability of the positive class (column 1) for each test row.
pred_probs = dask_model.predict_proba(X_test)[:, 1]

# Bring labels and scores into local memory, then score them with scikit-learn.
y_test_local, pred_probs_local = dask.compute(y_test, pred_probs)
auc_score = roc_auc_score(y_test_local, pred_probs_local)
print(f"Validation AUC for Dask LightGBM: {auc_score:.4f}")

# Accuracy = share of test rows whose predicted class equals the label.
preds = dask_model.predict(X_test)
is_correct = preds == y_test
accuracy = is_correct.mean().compute()
print(f"Validation Accuracy: {accuracy:.4f}")