# 2019320016 차주한

### Function that loads dataset

In [90]:
import pandas as pd

def load_data(path, remove_duplicate):
    """Read a mail CSV file and return it without its ``id`` column.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        remove_duplicate: When truthy, rows whose ``mail`` value repeats an
            earlier row are dropped (the first occurrence is kept).

    Returns:
        The loaded ``DataFrame`` minus the ``id`` column. Row index labels
        are not reset after de-duplication.

    Raises:
        KeyError: If the file has no ``id`` column.

    Side effects:
        Prints the row count (and the count after de-duplication, when
        requested), followed by the first five rows and a blank line.
    """
    frame = pd.read_csv(path).drop(columns="id")
    print(f"데이터 수: {len(frame)}")

    if remove_duplicate:
        frame = frame.drop_duplicates(subset=["mail"])
        print(f"중복 제거 후 데이터 수: {len(frame)}")

    # Quick visual sanity check of what was loaded.
    print(frame.head(5))
    print()

    return frame

### Function that splits mail into subject and body

In [50]:
def split_mail(X):
    """Split raw mails into parallel lists of subject lines and bodies.

    Each mail is expected to look like ``"Subject: <subject>\\r\\n<body>"``.
    Everything after the first ``"Subject: "`` marker up to the first
    ``"\\r\\n"`` is the subject; the remainder is the body.

    Malformed mails no longer raise ``IndexError``:
      * a mail without the ``"Subject: "`` marker is treated as if it
        started directly with the subject line;
      * a mail without any ``"\\r\\n"`` gets an empty body.

    Args:
        X: Iterable of mail strings.

    Returns:
        A ``(subject, body)`` tuple of two lists, each with one entry per
        mail, in input order.
    """
    subject = []
    body = []

    for mail in X:
        # partition() never raises: `marker` is "" when the prefix is absent.
        _, marker, remainder = mail.partition("Subject: ")
        if not marker:
            remainder = mail

        # First line is the subject; the rest (possibly "") is the body.
        subject_line, _, body_text = remainder.partition("\r\n")

        subject.append(subject_line)
        body.append(body_text)

    return subject, body


### Split train and validation set

In [51]:
from sklearn.model_selection import train_test_split

# Load the labelled training mails with duplicated mails removed.
data = load_data("./data/train.csv", True)

X = data["mail"]
y = data["label"]

# 70/30 split, stratified on the label so both parts keep the same class
# ratio; the fixed random_state makes the split reproducible.
X_train_temp, X_val_temp, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

# Subject and body are kept apart so each can get its own vectorizer/model.
X_train = {}
X_train["subject"], X_train["body"] = split_mail(X_train_temp)

X_val = {}
X_val["subject"], X_val["body"] = split_mail(X_val_temp)

# Report the class balance of each part (label 0 = non spam, label 1 = spam).
print("train data")
print(f"non spam: {round(y_train.value_counts()[0]/len(y_train) * 100, 3)}%")
print(f"spam: {round(y_train.value_counts()[1]/len(y_train) * 100, 3)}%")

print()

print("validation data")
print(f"non spam: {round(y_val.value_counts()[0]/len(y_val) * 100, 3)}%")
# Fixed: this line reports label 1, so it must be labelled "spam".
print(f"spam: {round(y_val.value_counts()[1]/len(y_val) * 100, 3)}%")

데이터 수: 3620
   label                                               mail
0      0  Subject: update on prc process\r\nplease read ...
1      1  Subject: via - gra pro will get you hard consu...
2      0  Subject: re : southern\r\ndarren :\r\ni zeroed...
3      0  Subject: hpl nom for sept . 22 , 2000\r\n( see...
4      1  Subject: perform your best in bed\r\ngeneric c...
중복 제거 후 데이터 수: 3528

train data
non spam: 71.297%
spam: 28.703%

validation data
non spam: 71.246%
non spam: 28.754%


### Preprocess data

In [65]:
from sklearn.feature_extraction.text import CountVectorizer

# One bag-of-words vectorizer per mail part, each with English stop words
# removed. Both are fitted on the training split only and reused later.
subject_vectorizer = CountVectorizer(stop_words="english")
body_vectorizer = CountVectorizer(stop_words="english")

# Document-term matrices of the training split, keyed by mail part.
X_train_dtm = {
    "subject": subject_vectorizer.fit_transform(X_train["subject"]),
    "body": body_vectorizer.fit_transform(X_train["body"]),
}

print(f"subject dtm shape: {X_train_dtm['subject'].shape}")
print(f"body dtm shape: {X_train_dtm['body'].shape}")

subject dtm shape: (2822, 3346)
body dtm shape: (2822, 34584)


### Train naive Bayes model

In [123]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import numpy as np

# Two independent multinomial naive Bayes classifiers, one per mail part,
# each with its own smoothing strength (alpha).
subject_mod = MultinomialNB(alpha=0.1)
body_mod = MultinomialNB(alpha=1)

subject_mod.fit(X_train_dtm["subject"], y_train)
body_mod.fit(X_train_dtm["body"], y_train)

# Weight given to the subject model when blending; the body model gets 1 - p.
p = 0.5

predicted_with_subject = subject_mod.predict_proba(X_train_dtm["subject"])
predicted_with_body = body_mod.predict_proba(X_train_dtm["body"])

# Blend the class probabilities of both models, then label a mail as 1 when
# the blended probability in column 1 exceeds 0.5.
# NOTE(review): column 1 is taken to be the spam class (labels 0/1) - confirm.
blended_proba = p * predicted_with_subject + (1 - p) * predicted_with_body
predicted = np.where(blended_proba[:, 1] > 0.5, 1, 0)

# Accuracy on the training split itself.
print(accuracy_score(y_train, predicted))


0.9961020552799433


### Test on validation set

In [124]:
# Vectorize the validation split with the vocabularies fitted on the
# training split (transform only, no refitting).
X_val_dtm = {
    "subject": subject_vectorizer.transform(X_val["subject"]),
    "body": body_vectorizer.transform(X_val["body"]),
}

predicted_with_subject = subject_mod.predict_proba(X_val_dtm["subject"])
predicted_with_body = body_mod.predict_proba(X_val_dtm["body"])

# Same blending rule as used on the training split: weight p for the
# subject model, 1 - p for the body model, threshold 0.5 on column 1.
blended_proba = p * predicted_with_subject + (1 - p) * predicted_with_body
predicted = np.where(blended_proba[:, 1] > 0.5, 1, 0)

print(accuracy_score(y_val, predicted))

0.9801699716713881


### Predict on test set

In [110]:
import csv

# Load the unlabelled test mails; duplicates are kept so that every row
# receives a prediction.
test_data = load_data("./data/test.csv", False)

X_test_temp = test_data["mail"]

X_test = {}
X_test["subject"], X_test["body"] = split_mail(X_test_temp)

# Vectorize with the vocabularies fitted on the training split.
X_test_dtm = {"subject": subject_vectorizer.transform(X_test["subject"]), "body": body_vectorizer.transform(X_test["body"])}

predicted_with_subject = subject_mod.predict_proba(X_test_dtm["subject"])
predicted_with_body = body_mod.predict_proba(X_test_dtm["body"])

# Blend both models' probabilities (weight p for the subject model) and
# label a mail as 1 when the blended probability in column 1 exceeds 0.5.
predicted = np.where((p * predicted_with_subject + (1 - p) * predicted_with_body)[:, 1] > 0.5, 1, 0)
print(predicted)

# Write the submission file. The context manager closes the file even if a
# write fails, and enumerate() replaces the former global counter named `id`,
# which shadowed the builtin id() for the rest of the kernel session.
# NOTE(review): load_data drops the original "id" column, so ids are
# regenerated here as 0..n-1 in row order - confirm this matches test.csv.
with open("result_NB.csv", "w", newline="") as f:
    wr = csv.writer(f)
    wr.writerow(["id", "label"])
    wr.writerows(enumerate(predicted))

데이터 수: 1551
                                                mail
0  Subject: re : coastal ctr # 96008903 meter 098...
1  Subject: re :\r\nfyi\r\n- - - - - - - - - - - ...
2  Subject: hpl nomination changes for july 25 an...
3  Subject: new stack manager\r\nthere is a new v...
4  Subject: deletion of your enrononline user id\...

[0 0 0 ... 1 0 0]
