In [1]:
import os
import pandas as pd
from string import punctuation

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from nltk.stem.snowball import  EnglishStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

[nltk_data] Error loading punkt: <urlopen error Tunnel connection
[nltk_data]     failed: 502 Fiddler - DNS Lookup Failed>
[nltk_data] Error loading stopwords: <urlopen error Tunnel connection
[nltk_data]     failed: 502 Fiddler - DNS Lookup Failed>
[nltk_data] Error loading wordnet: <urlopen error Tunnel connection
[nltk_data]     failed: 502 Fiddler - DNS Lookup Failed>


In [2]:
# Tokens to discard: NLTK's English stop words plus every ASCII punctuation character.
_stop_words = set(stopwords.words('english')).union(punctuation)

In [3]:
# Treebank tokenizer: clean_data uses it to split each raw document into tokens.
tokenizer = TreebankWordTokenizer() 
# Matching detokenizer: clean_data uses it to join the surviving tokens back into one string.
detokenizer = TreebankWordDetokenizer()

In [4]:
# Snowball English stemmer; data_unique_words and clean_data apply it to every token first.
stemmer = EnglishStemmer()


In [5]:
# WordNet lemmatizer; data_unique_words and clean_data apply it to the stemmed token.
lemmatizer = WordNetLemmatizer()

In [6]:
# Every English entry known to WordNet; tokens outside this set are dropped during cleaning.
_word_list = set(wordnet.words(lang='eng'))


In [7]:
def read_files(dir):
    """Collect the path of every file found under ``dir``, recursively.

    Parameters
    ----------
    dir : str
        Directory handed to ``os.walk``.

    Returns
    -------
    list of str
        One entry per file, built by joining the directory ``os.walk``
        reported with the file's name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(parent, filename)
        for parent, _subdirs, filenames in os.walk(dir)
        for filename in filenames
    ]


In [8]:
def file_loca(file_paths):
    """Read files into four lists according to a keyword found in their path.

    A path is assigned to the first of ``"comp"``, ``"rec"``, ``"sci"``,
    ``"talk"`` (checked in that order) that occurs anywhere in the path
    string. Paths containing none of the keywords are skipped without being
    opened.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the files to read.

    Returns
    -------
    tuple of four lists of str
        ``(comp, rec, sci, talk)``; each list holds the full text of the
        matching files, in input order. Files are decoded as UTF-8 and
        undecodable bytes are dropped (``errors='ignore'``).
    """
    comp = []
    rec = []
    sci = []
    talk = []

    # Order matters: it reproduces the priority of the original if/elif chain.
    # NOTE(review): this is a substring test on the whole path, so any file
    # whose name or parent directories happen to contain a keyword is matched.
    categories = (("comp", comp), ("rec", rec), ("sci", sci), ("talk", talk))

    for path in file_paths:
        for keyword, texts in categories:
            if keyword in path:
                # The context manager closes the handle right after reading,
                # so a large corpus does not accumulate open file descriptors.
                with open(path, encoding='utf-8', errors='ignore') as handle:
                    texts.append(handle.read())
                break
    return comp, rec, sci, talk

In [9]:
def data_unique_words(data_list):
    """Build the vocabulary of normalised words found in a set of corpora.

    Each document is split with ``word_tokenize``; every token is stemmed and
    then lemmatized. A resulting word is kept only if it is not in
    ``_stop_words``, is present in ``_word_list`` and is purely alphabetic.

    Parameters
    ----------
    data_list : iterable of iterables of str
        Collections of documents (one inner iterable per collection).

    Returns
    -------
    list of str
        The distinct surviving words, sorted so the result is the same on
        every run.
    """
    # Accumulate straight into a set: only distinct words are ever stored,
    # instead of every surviving token of the whole corpus.
    vocabulary = set()
    for docs in data_list:
        for doc in docs:
            for token in word_tokenize(doc):
                word = lemmatizer.lemmatize(stemmer.stem(token))
                if word not in _stop_words and word in _word_list and word.isalpha():
                    vocabulary.add(word)
    return sorted(vocabulary)

In [10]:
def clean_data(data):
    """Normalise each document and rebuild it from the words that survive.

    Every text is tokenized with the Treebank tokenizer; each token is stemmed
    and then lemmatized. Only results that are in ``_word_list`` and purely
    alphabetic are kept, and the kept words are joined back into one string
    with the Treebank detokenizer.

    Parameters
    ----------
    data : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order.
    """
    def _normalise(token):
        # Stem first, then lemmatize the stem.
        return lemmatizer.lemmatize(stemmer.stem(token))

    cleaned_data = []
    for text in data:
        normalised = [_normalise(token) for token in tokenizer.tokenize(text)]
        kept = [token for token in normalised if token in _word_list and token.isalpha()]
        cleaned_data.append(detokenizer.detokenize(kept))
    return cleaned_data

In [11]:
def make_vectorize_csv(data, name, vocabulary_set):
    """Write a bag-of-words count matrix for ``data`` to ``<name>.csv``.

    The documents are passed through ``clean_data`` and then counted with a
    ``CountVectorizer`` restricted to ``vocabulary_set``, so every CSV produced
    with the same vocabulary has the same columns in the same order.

    Parameters
    ----------
    data : iterable of str
        Raw documents.
    name : str
        Output path without the ``.csv`` extension.
    vocabulary_set : iterable of str
        Fixed vocabulary handed to ``CountVectorizer(vocabulary=...)``.

    Returns
    -------
    None
        The function writes the CSV (one row per document, one column per
        vocabulary term, plus the DataFrame index) and prints a confirmation.
    """
    cleaned_data = clean_data(data)

    # CountVectorizer documents stop_words as 'english', a list or None; a set
    # is rejected by the parameter validation of recent scikit-learn releases.
    vectorizer = CountVectorizer(stop_words=sorted(_stop_words), vocabulary=vocabulary_set)
    counts = vectorizer.fit_transform(cleaned_data)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # fall back to it only on releases that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    df = pd.DataFrame(data=counts.toarray(), columns=feature_names)

    df.to_csv(name+'.csv')

    print(name+'.csv successfully created!')

In [12]:

# make_vectorize_csv writes its '<name>_train.csv' / '<name>_test.csv' outputs
# into this same directory, and their names contain "comp", "rec", "sci" and
# "talk". Without this filter a second run would read them back as documents.
_GENERATED_SUFFIXES = ('_train.csv', '_test.csv')

files = [path for path in read_files('./') if not path.endswith(_GENERATED_SUFFIXES)]

# file_loca returns the four per-category document lists directly, so no
# placeholder initialisation is needed beforehand.
comp_data, rec_data, sci_data, talk_data = file_loca(files)

In [13]:
# Same 70/30 split and the same seed for every category.
_SPLIT_KWARGS = dict(train_size=0.7, test_size=0.3, random_state=42)

comp_train, comp_test = train_test_split(comp_data, **_SPLIT_KWARGS)
rec_train, rec_test = train_test_split(rec_data, **_SPLIT_KWARGS)
sci_train, sci_test = train_test_split(sci_data, **_SPLIT_KWARGS)
talk_train, talk_test = train_test_split(talk_data, **_SPLIT_KWARGS)


In [14]:
# Training splits grouped in comp / rec / sci / talk order; passed to
# data_unique_words to build the shared vocabulary.
train_data = [comp_train, rec_train, sci_train, talk_train]
# Held-out splits in the same category order.
test_data = [comp_test, rec_test, sci_test, talk_test]

In [None]:
# Vocabulary from the training splits only, sorted so the CSV column order is stable.
data_diction = sorted(data_unique_words(train_data))

In [None]:
# One CSV per (category, split), all sharing the same vocabulary columns.
_exports = (
    (comp_train, 'computer_train'),
    (comp_test, 'computer_test'),
    (rec_train, 'recreational_train'),
    (rec_test, 'recreational_test'),
    (sci_train, 'science_train'),
    (sci_test, 'science_test'),
    (talk_train, 'talk_train'),
    (talk_test, 'talk_test'),
)

for _documents, _csv_name in _exports:
    make_vectorize_csv(_documents, _csv_name, data_diction)