# 2019320016 차주한

### Dataset load function

In [2]:
import pandas as pd

def load_data(path, remove_duplicate):
    """Read a mail CSV, drop its ``id`` column and optionally de-duplicate.

    Args:
        path: Location of the CSV file. It must contain an ``id`` column,
            and a ``mail`` column when ``remove_duplicate`` is truthy.
        remove_duplicate: When truthy, rows whose ``mail`` value repeats an
            earlier row are dropped (the first occurrence is kept).

    Returns:
        The resulting DataFrame. Row labels are not reset after
        de-duplication, so gaps in the index are possible.

    Side effects:
        Prints the row count (before and, if applicable, after
        de-duplication), the first five rows, and a blank line.
    """
    frame = pd.read_csv(path).drop(columns="id")
    print(f"데이터 수: {len(frame)}")

    if remove_duplicate:
        frame = frame.drop_duplicates(subset=["mail"])
        print(f"중복 제거 후 데이터 수: {len(frame)}")

    # Short preview so the reader can eyeball the loaded columns.
    print(frame.head(5))
    print()

    return frame

### Tokenizer

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer

# Fetch the NLTK resources this notebook asks for; packages that are already
# installed are reported as up-to-date rather than downloaded again.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("treebank")

tb_tokenizer = TreebankWordTokenizer()
stop_words = stopwords.words("english")
# Set copy of the stop-word list: membership is checked once per token, and a
# set lookup is constant-time whereas scanning the list is linear.
_stop_word_set = frozenset(stop_words)
lemmatizer = WordNetLemmatizer()

def tokenizer(text):
    """Split ``text`` into lemmatized tokens with English stop words removed.

    The text is tokenized with the Treebank word tokenizer, tokens that
    exactly match an entry of the NLTK English stop-word list are dropped
    (the comparison is case-sensitive), and each remaining token is passed
    through the WordNet lemmatizer with its default part of speech.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lemmatized token strings, in their original order.
    """
    return [
        lemmatizer.lemmatize(word)
        for word in tb_tokenizer.tokenize(text)
        if word not in _stop_word_set
    ]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hanchaa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hanchaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hanchaa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\hanchaa\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


### Split train and validation set

In [4]:
from sklearn.model_selection import train_test_split

data = load_data("./data/train.csv", True)

X = data["mail"]
y = data["label"]

# Hold out 20% for validation. Stratifying on the label keeps the class ratio
# the same in both splits, and the fixed seed makes the split reproducible.
X_train_temp, X_val_temp, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)


def _print_class_ratio(title, labels):
    """Print the percentage of label 0 (non spam) and label 1 (spam) rows.

    Args:
        title: Heading printed above the two percentages.
        labels: Series of 0/1 labels; both values must be present.
    """
    counts = labels.value_counts()
    print(title)
    print(f"non spam: {round(counts[0] / len(labels) * 100, 3)}%")
    print(f"spam: {round(counts[1] / len(labels) * 100, 3)}%")


_print_class_ratio("train data", y_train)

print()

# The second validation line used to be mislabelled "non spam"; the shared
# helper keeps the labels of both splits consistent.
_print_class_ratio("validation data", y_val)

데이터 수: 3620
중복 제거 후 데이터 수: 3528
   label                                               mail
0      0  Subject: update on prc process\r\nplease read ...
1      1  Subject: via - gra pro will get you hard consu...
2      0  Subject: re : southern\r\ndarren :\r\ni zeroed...
3      0  Subject: hpl nom for sept . 22 , 2000\r\n( see...
4      1  Subject: perform your best in bed\r\ngeneric c...

train data
non spam: 71.297%
spam: 28.703%

validation data
non spam: 71.246%
non spam: 28.754%


### Preprocess data

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features from the full mail text using the custom tokenizer.
# The vocabulary and IDF weights are learned from the training split only.
vectorizer = TfidfVectorizer(tokenizer=tokenizer)
X_train_vectorized = vectorizer.fit_transform(X_train_temp)
# The matrix covers the whole mail, not just the subject line, so the label
# names the training split instead.
print("train dtm shape: " + str(X_train_vectorized.shape))

subject dtm shape: (2822, 33572)


### Train naive Bayes model

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import numpy as np

# subject_mod = MultinomialNB(alpha=0.1)
# subject_mod.fit(X_train_vectorized["subject"], y_train)
#
# body_mod = MultinomialNB(alpha=1)
# body_mod.fit(X_train_vectorized["body"], y_train)
#
# p = 0.6
# threshold = 0.5
#
# predicted_with_subject = subject_mod.predict_proba(X_train_vectorized["subject"])
# predicted_with_body = body_mod.predict_proba(X_train_vectorized["body"])
#
# predicted = np.where((p * predicted_with_subject + (1 - p) * predicted_with_body)[:, 1] > threshold, 1, 0)

# Fit a multinomial naive Bayes classifier (smoothing alpha=0.1) on the
# TF-IDF training matrix; fit() returns the estimator itself.
model = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)

# Accuracy on the same rows the model was fitted on (training accuracy).
predicted = model.predict(X_train_vectorized)
train_accuracy = accuracy_score(y_train, predicted)
print(train_accuracy)


0.9932671863926293


### Test on validation set

In [11]:
# X_val_dtm = {"subject": subject_vectorizer.transform(X_val["subject"]), "body": body_vectorizer.transform(X_val["body"])}
#
# predicted_with_subject = subject_mod.predict_proba(X_val_dtm["subject"])
# predicted_with_body = body_mod.predict_proba(X_val_dtm["body"])
#
# predicted = np.where((p * predicted_with_subject + (1 - p) * predicted_with_body)[:, 1] > threshold, 1, 0)

# Transform only: the validation mails are encoded with the vectorizer that
# was fitted on the training split, so no validation text influences it.
X_val_vectorized = vectorizer.transform(X_val_temp)
predicted = model.predict(X_val_vectorized)

val_accuracy = accuracy_score(y_val, predicted)
print(val_accuracy)

0.9759206798866855


### Predict on test set

In [12]:
import csv

# Test mails are loaded without de-duplication so every input row gets a
# prediction.
test_data = load_data("./data/test.csv", False)

X_test_temp = test_data["mail"]

# X_test = {}
# X_test["subject"], X_test["body"] = split_mail(X_test_temp)
#
# X_test_dtm = {"subject": subject_vectorizer.transform(X_test["subject"]), "body": body_vectorizer.transform(X_test["body"])}
#
# predicted_with_subject = subject_mod.predict_proba(X_test_dtm["subject"])
# predicted_with_body = body_mod.predict_proba(X_test_dtm["body"])
#
# predicted = np.where((p * predicted_with_subject + (1 - p) * predicted_with_body)[:, 1] > threshold, 1, 0)
X_test_vectorized = vectorizer.transform(X_test_temp)
predicted = model.predict(X_test_vectorized)

print(predicted)

# Write one "id,label" row per prediction, where id is the 0-based position of
# the mail in the test file. The context manager closes the file even if a
# write fails, and enumerate() replaces the hand-maintained counter that
# shadowed the builtin `id`.
with open("result_NB.csv", "w", newline="") as f:
    wr = csv.writer(f)
    wr.writerow(["id", "label"])
    wr.writerows(enumerate(predicted))

데이터 수: 1551
                                                mail
0  Subject: re : coastal ctr # 96008903 meter 098...
1  Subject: re :\r\nfyi\r\n- - - - - - - - - - - ...
2  Subject: hpl nomination changes for july 25 an...
3  Subject: new stack manager\r\nthere is a new v...
4  Subject: deletion of your enrononline user id\...

[0 0 0 ... 1 0 0]
