In [2]:
import csv
import gensim
from collections import defaultdict
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Raise the csv module's maximum field size to 2**31 - 1.
# Presumably needed because single fields in the CSV data files (whole
# documents) can be very long -- TODO confirm.
csv.field_size_limit(2147483647)
# Single Porter stemmer instance shared by preprocessing().
stemmer = gensim.parsing.porter.PorterStemmer()

In [3]:
def customize_split(sentence, delimiter, start):
    """Cut `sentence` on `delimiter` and keep the pieces from index `start` onward.

    Args:
        sentence: Text to cut.
        delimiter: Separator string handed to ``str.split``.
        start: Slice start applied to the list of pieces (may be negative,
            e.g. ``-1`` keeps only the last piece).

    Returns:
        The original text when `delimiter` does not occur in it; otherwise
        the selected pieces joined with single spaces.
    """
    pieces = sentence.split(delimiter)
    if len(pieces) != 1:
        # Delimiter occurred at least once: keep the requested tail.
        return ' '.join(pieces[start:])
    # Delimiter absent: hand the text back untouched.
    return pieces[0]

In [15]:
def predict(output_path='result/sklearn_version.csv', classifier=None, features=None):
    """Predict a class for every row of `features` and write the result as CSV.

    Called with no arguments it behaves as before: it uses the module-level
    ``clf`` and ``X_test_tfidf`` and writes ``result/sklearn_version.csv``.

    Args:
        output_path: Destination CSV file. Missing parent directories are
            created instead of failing with FileNotFoundError.
        classifier: Object exposing ``predict(features)``; defaults to the
            global ``clf``.
        features: Input passed to ``classifier.predict``; defaults to the
            global ``X_test_tfidf``.

    Returns:
        None. The file gets a ``doc_id,class_id`` header followed by one row
        per prediction, where ``doc_id`` is the 0-based position.
    """
    import os  # local import: keeps this cell independent of the import cell

    # Fall back to the notebook globals only when the caller gave nothing,
    # so the original zero-argument call keeps working.
    if classifier is None:
        classifier = clf
    if features is None:
        features = X_test_tfidf

    print("Predicting...")
    predicted = classifier.predict(features)

    # open(..., 'w') does not create directories; make sure the target exists.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # newline='' lets the csv writer control line endings itself.
    with open(output_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["doc_id", "class_id"])
        for doc_id, class_id in enumerate(predicted):
            writer.writerow([str(doc_id), class_id])
    print("done")

In [8]:
# Substrings that preprocessing() deletes from every document, one per line.
# The explicit encoding makes the read independent of the platform default
# and matches how the other data files in this notebook are opened.
with open("data/filters.txt", "r", encoding="UTF-8") as f:
    # Skip empty lines (e.g. the one produced by a trailing newline) so no
    # '' entry ends up in the list.
    filters = [line for line in f.read().split('\n') if line]

# Words that preprocessing() drops, one per line.
with open("data/stop.txt", "r", encoding="UTF-8") as f:
    stopwords = [line for line in f.read().split('\n') if line]


In [10]:
# Read the category names (second column of data/groups.csv); they are later
# passed to fetch_20newsgroups(categories=...).
# newline='' is what the csv module asks for when given a file object, so the
# reader sees line endings untranslated.
with open('data/groups.csv', 'r', encoding='UTF-8', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # skip the first row (assumed to be a header)
    categories = [row[1] for row in reader]
num_category = len(categories)

In [12]:
# Read the documents to classify (second column of data/doc.csv), keeping
# file order so list positions can serve as document ids.
# newline='' is what the csv module asks for when given a file object; it
# keeps line breaks embedded in quoted fields untranslated.
with open('data/doc.csv', 'r', encoding='UTF-8', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # skip the first row (assumed to be a header)
    test = [row[1] for row in reader]

In [13]:
def preprocessing(documents):
    """Clean raw documents into space-separated strings of unique stemmed words.

    For each document the steps are: lowercase; cut with customize_split() on
    the literal two-character sequences ``\\n\\n`` (keeping everything after
    the first occurrence) and on ``writes:`` / ``wrote:`` (keeping only what
    follows the last occurrence); turn literal ``\\n``, tabs and hyphens into
    spaces; collapse whitespace; delete every substring listed in the global
    ``filters``; drop words found in the global ``stopwords`` or whose length
    is outside 3..11; stem the rest with the global ``stemmer``; keep only the
    first occurrence of each stem.

    Args:
        documents: Iterable of strings.

    Returns:
        List of cleaned strings, one per input document, in input order.
    """
    # Hoisted once per call: set membership replaces a linear scan of the
    # stop-word list for every token.
    stop_set = set(stopwords)
    ok = []
    for document in documents:
        document = document.lower()
        # NOTE(review): "\\n" here is a backslash followed by 'n', not a real
        # newline character. Text containing real newlines is never cut at
        # "\\n\\n" (the newlines are only collapsed by split() below), so
        # inputs with real vs. escaped newlines are cleaned differently.
        # Presumably the fetch_20newsgroups training text has real newlines
        # -- confirm whether that asymmetry is intended.
        for delimiter, start in (("\\n\\n", 1), ("writes:", -1), ("wrote:", -1)):
            document = customize_split(document, delimiter, start)
        document = document.replace("\\n", " ").replace("\t", " ").replace("-", " ")
        document = ' '.join(document.split())
        for f in filters:
            document = document.replace(f, '')
        clean_words = []
        # Tracks stems already emitted so de-duplication is O(1) per word
        # while clean_words preserves first-occurrence order.
        seen = set()
        for word in document.split():
            if word in stop_set or not 3 <= len(word) <= 11:
                continue
            word = stemmer.stem_sentence(word)
            if word not in seen:
                seen.add(word)
                clean_words.append(word)
        ok.append(' '.join(clean_words))
    return ok

In [14]:
# Train the classifier: bag-of-words counts -> TF-IDF weights -> multinomial
# naive Bayes. The names bound here (count_vect, tfidf_transformer, clf) are
# reused by the testing cell and by predict().
print("Training...")
# Only the groups listed in `categories` are loaded; random_state=0 fixes the
# shuffle order.
train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=0)
p = preprocessing(train.data)
# Show the first cleaned training document as a quick sanity check.
print(p[0])
count_vect = CountVectorizer()
# fit_transform learns the vocabulary from the training text.
X_train_counts = count_vect.fit_transform(p)
tfidf_transformer = TfidfTransformer()
# fit_transform learns the IDF weights from the training counts.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# alpha=0.05 is the additive smoothing parameter passed to MultinomialNB.
clf = MultinomialNB(alpha=0.05).fit(X_train_tfidf, train.target)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Training...
david davidian subject account anti armenian human right violat azerbaijan summari prelud current event nagorno karabakh sdpa center region studi line ask what go sai matter cant see theyv overturn car theyr kill deposit zaven badasian born emploi sumgait bulk yarn plant resid build apart februari wife went baku shop return around five even ran on rel bu station got talk lot peopl gather far awai near store well first didnt know happen fellow come azerbaijani gui stand home immedi help catch cab safe sat two dai time gang bandit came courtyard neighbor wouldnt let stick piec armatur hand shout someth couldnt understand wasnt voic choru turn toward third _floor break glass throw thing window entrywai pair jean anoth tape record guitar auto part save midnight march hide school famili altogeth known ernest move kirovabad guard want nowher els plead told would attack upstair classroom second floor citi radio announc three telephon number could us summon assist commun anyth impo

In [16]:
# Vectorize the test documents with the objects fitted in the training cell
# and write the predictions to disk. Must run after the training cell.
print("Testing...")
# NOTE: this rebinds `p`, which previously held the cleaned training documents.
p = preprocessing(test)
# Show the first cleaned test document as a quick sanity check.
print(p[0])
# transform (not fit_transform): reuse the vocabulary and IDF weights learned
# from the training data.
X_test_counts = count_vect.transform(p)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
# Called without arguments, predict() reads the globals clf and X_test_tfidf
# and writes result/sklearn_version.csv.
predict()

Testing...
sure basher pen fan pretti confus lack kind post recent massacr devil actual bit puzzl reliev howev go put end non relief prais man kill wors thought jagr show much better regular season stat also lot fun watch playoff bowman let next coupl game sinc beat pulp jersei anywai see island lose final rule
Predicting...
done
