In [1]:
# !pip install dask
import dask
import logging
import gzip
import json

from dask import dataframe as dd
from dask.distributed import Client, progress

In [2]:
# Scale up: connect to your own cluster with bmore resources
# see http://dask.pydata.org/en/latest/setup.html
client = Client(processes=False, threads_per_worker=4, n_workers=1, memory_limit='12GB', silence_logs=logging.ERROR)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://192.168.100.5:8787/status,

0,1
Dashboard: http://192.168.100.5:8787/status,Workers: 1
Total threads: 4,Total memory: 11.18 GiB
Status: running,Using processes: False

0,1
Comm: inproc://192.168.100.5/21556/1,Workers: 1
Dashboard: http://192.168.100.5:8787/status,Total threads: 4
Started: Just now,Total memory: 11.18 GiB

0,1
Comm: inproc://192.168.100.5/21556/4,Total threads: 4
Dashboard: http://192.168.100.5:60902/status,Memory: 11.18 GiB
Nanny: None,
Local directory: C:\Users\W10Home\AppData\Local\Temp\dask-scratch-space\worker-_3k6ibmf,Local directory: C:\Users\W10Home\AppData\Local\Temp\dask-scratch-space\worker-_3k6ibmf


Wczytanie Subsetu Danych

In [3]:
df = dd.read_json('AMAZON_FASHION.json.gz')

In [4]:
df.head(10)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5,True,"10 20, 2014",A1D4G1SNUZWQOT,7106116521,Tracy,Exactly what I needed.,perfect replacements!!,1413763200,,,
1,2,True,"09 28, 2014",A3DDWDH9PX2YX2,7106116521,Sonja Lau,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",1411862400,3.0,,
2,4,False,"08 25, 2014",A2MWC41EW7XL15,7106116521,Kathleen,Love these... I am going to order another pack...,My New 'Friends' !!,1408924800,,,
3,2,True,"08 24, 2014",A2UH2QQ275NV45,7106116521,Jodi Stoner,too tiny an opening,Two Stars,1408838400,,,
4,3,False,"07 27, 2014",A89F3LQADZBS5,7106116521,Alexander D.,Okay,Three Stars,1406419200,,,
5,5,True,"07 19, 2014",A29HLOUW0NS0EH,7106116521,Patricia R. Erwin,Exactly what I wanted.,Five Stars,1405728000,,,
6,4,True,"05 31, 2014",A7QS961ROI6E0,7106116521,REBECCA S LAYTON,These little plastic backs work great. No mor...,Works great!,1401494400,,,
7,3,True,"09 22, 2013",A1BB77SEBQT8VX,B00007GDFV,Darrow H Ankrum II,mother - in - law wanted it as a present for h...,bought as a present,1379808000,,{'Color:': ' Black'},
8,3,True,"07 17, 2013",AHWOW7D1ABO9C,B00007GDFV,rosieO,"Item is of good quality. Looks great, too. But...",Buxton heiress collection,1374019200,,{'Color:': ' Black'},
9,3,True,"04 13, 2013",AKS3GULZE0HFC,B00007GDFV,M. Waltman,I had used my last el-cheapo fake leather ciga...,Top Clasp Broke Within 3 days!,1365811200,,{'Color:': ' Black'},


In [5]:
columns_to_drop = set(df.columns) - set(['reviewText', 'summary', 'overall'])
df = df.drop(columns=columns_to_drop)

Tworzenie Etykiet (Labeling)

In [6]:
df['label'] = df['overall'].map(lambda x: 1 if x >= 4 else 0) #  4 >= is positive

In [7]:
df.head(10)

Unnamed: 0,overall,reviewText,summary,label
0,5,Exactly what I needed.,perfect replacements!!,1
1,2,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",0
2,4,Love these... I am going to order another pack...,My New 'Friends' !!,1
3,2,too tiny an opening,Two Stars,0
4,3,Okay,Three Stars,0
5,5,Exactly what I wanted.,Five Stars,1
6,4,These little plastic backs work great. No mor...,Works great!,1
7,3,mother - in - law wanted it as a present for h...,bought as a present,0
8,3,"Item is of good quality. Looks great, too. But...",Buxton heiress collection,0
9,3,I had used my last el-cheapo fake leather ciga...,Top Clasp Broke Within 3 days!,0


Czyszczenie tekstu

In [8]:
import re

def clean_text(text):
    if not isinstance(text, str):
        if isinstance(text, float):
            text = str(text)
        else:
            return text
    # text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Opcjonalnie: Usuwanie pojedynczych liter (może nie być zawsze potrzebne)
    # text = re.sub(r'\s[b-zB-Z]\s', ' ', text)
    return text

In [9]:
df['cleaned_text'] = df['reviewText'].map(clean_text)

In [10]:
df.head(10)

Unnamed: 0,overall,reviewText,summary,label,cleaned_text
0,5,Exactly what I needed.,perfect replacements!!,1,Exactly what I needed
1,2,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",0,I agree with the other review the opening is t...
2,4,Love these... I am going to order another pack...,My New 'Friends' !!,1,Love these I am going to order another pack to...
3,2,too tiny an opening,Two Stars,0,too tiny an opening
4,3,Okay,Three Stars,0,Okay
5,5,Exactly what I wanted.,Five Stars,1,Exactly what I wanted
6,4,These little plastic backs work great. No mor...,Works great!,1,These little plastic backs work great No more...
7,3,mother - in - law wanted it as a present for h...,bought as a present,0,mother in law wanted it as a present for her...
8,3,"Item is of good quality. Looks great, too. But...",Buxton heiress collection,0,Item is of good quality Looks great too But it...
9,3,I had used my last el-cheapo fake leather ciga...,Top Clasp Broke Within 3 days!,0,I had used my last elcheapo fake leather cigar...


Stemming

In [11]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
df['stemmed_text'] = df['reviewText'].map(clean_text).map(lambda x: stemmer.stem(x))

In [12]:
df.head(10)

Unnamed: 0,overall,reviewText,summary,label,cleaned_text,stemmed_text
0,5,Exactly what I needed.,perfect replacements!!,1,Exactly what I needed,exactly what i need
1,2,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",0,I agree with the other review the opening is t...,i agree with the other review the opening is t...
2,4,Love these... I am going to order another pack...,My New 'Friends' !!,1,Love these I am going to order another pack to...,love these i am going to order another pack to...
3,2,too tiny an opening,Two Stars,0,too tiny an opening,too tiny an open
4,3,Okay,Three Stars,0,Okay,okay
5,5,Exactly what I wanted.,Five Stars,1,Exactly what I wanted,exactly what i w
6,4,These little plastic backs work great. No mor...,Works great!,1,These little plastic backs work great No more...,these little plastic backs work great no more...
7,3,mother - in - law wanted it as a present for h...,bought as a present,0,mother in law wanted it as a present for her...,mother in law wanted it as a present for her...
8,3,"Item is of good quality. Looks great, too. But...",Buxton heiress collection,0,Item is of good quality Looks great too But it...,item is of good quality looks great too but it...
9,3,I had used my last el-cheapo fake leather ciga...,Top Clasp Broke Within 3 days!,0,I had used my last elcheapo fake leather cigar...,i had used my last elcheapo fake leather cigar...


Wektoryzacja

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Wektoryzacja - musi być wykonana po obliczeniach Dask
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['stemmed_text'].compute())
y = df['label'].compute()

Modelowanie i Klasyfikacja

In [16]:
# from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from dask_ml.model_selection import train_test_split

# Podział na zbiory treningowe i testowe
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Klasyfikacja
# model = RandomForestClassifier(verbose=2)
model = LogisticRegression(max_iter=10000,verbose=2)
model.fit(X_train, y_train)

Ocena Modelu

In [19]:
# from dask_ml.metrics import accuracy_score
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.889185641211353
