In [4]:
import os
import pandas as pd
import numpy as np
import random
from sklearn.pipeline import Pipeline
from scipy.stats import uniform

# for text pre-processing
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Shared lemmatizer instance used by lemmatizer() for every token.
wordnet_lemmatizer = WordNetLemmatizer()

# Fetch the NLTK data packages the preprocessing helpers depend on.
# 'punkt_tab' is the tokenizer data nltk.word_tokenize looks up on current
# NLTK releases; 'punkt' is kept so older releases keep working. Requesting a
# package that is already installed is a cheap no-op.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# English stop-word list consulted by remove_stopwords().
stopwords = nltk.corpus.stopwords.words('english')

# for model-building
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# visualizers
import seaborn as sns
import matplotlib.pyplot as plt 

# Root folder that contains the "20news-bydate-train" and "20news-bydate-test"
# directories. Set the NEWS20_DATA_DIR environment variable to point at your
# own copy of the dataset; when it is unset the original default is used.
DATA_ROOT = os.environ.get(
    "NEWS20_DATA_DIR",
    r"C:\Users\James\machinelearning\Datasets\20news-bydate-txt-true",
)
trainpath = os.path.join(DATA_ROOT, "20news-bydate-train")
testpath = os.path.join(DATA_ROOT, "20news-bydate-test")

# Accumulators for [category, text] pairs, filled by pullTrainSet() and
# pullTestSet().
rawtrain = []
rawtest = []

# Category names = the sub-folders of the training directory. Plain files
# sitting next to them are ignored, and the names are sorted so the order does
# not depend on the operating system. The working directory is deliberately
# left alone: every path used here is absolute.
directory_contents = sorted(
    entry for entry in os.listdir(trainpath)
    if os.path.isdir(os.path.join(trainpath, entry))
)
print(directory_contents)

ModuleNotFoundError: No module named 'pandas'

In [None]:
def pullTrainSet(encoding="latin-1"):
    """Read every training document into the module-level ``rawtrain`` list.

    For each name in ``directory_contents`` the folder of that name under
    ``trainpath`` is scanned, and one ``[category, text]`` pair per file is
    appended to ``rawtrain``. The process working directory is not changed.

    Args:
        encoding: Codec used to decode the files. The default ``"latin-1"``
            maps every possible byte to a character, so a non-ASCII byte in a
            post cannot abort the load with ``UnicodeDecodeError`` the way the
            platform-default codec can.

    Returns:
        None. Results accumulate in ``rawtrain``; calling the function twice
        appends every document twice.
    """
    for foldername in directory_contents:
        # os.path.join picks the right separator on every OS, unlike gluing
        # a literal backslash between the parts.
        category_dir = os.path.join(trainpath, foldername)
        if not os.path.isdir(category_dir):
            # A stray file next to the category folders is not a category.
            continue
        # Sorted so the load order does not depend on the file system.
        for filename in sorted(os.listdir(category_dir)):
            document_path = os.path.join(category_dir, filename)
            if not os.path.isfile(document_path):
                continue
            with open(document_path, 'r', encoding=encoding) as f:
                rawtrain.append([foldername, f.read()])

In [None]:
def pullTestSet(encoding="latin-1"):
    """Read every test document into the module-level ``rawtest`` list.

    For each name in ``directory_contents`` the folder of that name under
    ``testpath`` is scanned, and one ``[category, text]`` pair per file is
    appended to ``rawtest``. The process working directory is not changed.

    Args:
        encoding: Codec used to decode the files. The default ``"latin-1"``
            maps every possible byte to a character, so a non-ASCII byte in a
            post cannot abort the load with ``UnicodeDecodeError`` the way the
            platform-default codec can.

    Returns:
        None. Results accumulate in ``rawtest``; calling the function twice
        appends every document twice.

    Raises:
        OSError: from ``os.listdir`` when a name in ``directory_contents``
            has no matching folder under ``testpath``.
    """
    for foldername in directory_contents:
        # os.path.join picks the right separator on every OS, unlike gluing
        # a literal backslash between the parts.
        category_dir = os.path.join(testpath, foldername)
        # Sorted so the load order does not depend on the file system.
        for filename in sorted(os.listdir(category_dir)):
            document_path = os.path.join(category_dir, filename)
            if not os.path.isfile(document_path):
                continue
            with open(document_path, 'r', encoding=encoding) as f:
                rawtest.append([foldername, f.read()])

In [None]:
# Empty the accumulators first so that re-running this cell reloads the
# corpus instead of appending a second copy of every document.
rawtrain.clear()
rawtest.clear()
pullTrainSet()
pullTestSet()

# Shuffle with a dedicated, seeded generator so the document order is the
# same on every run of the notebook.
SHUFFLE_SEED = 42
shuffle_rng = random.Random(SHUFFLE_SEED)
shuffle_rng.shuffle(rawtrain)
shuffle_rng.shuffle(rawtest)

In [None]:
def remove_punctuation(text):
    """Return ``text`` without any character found in ``string.punctuation``."""
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

# Lower-casing needs no helper of its own: the built-in str.lower() is used directly.

def tokenization(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    return text.split()

def nltk_tokenization(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return its result."""
    return nltk.word_tokenize(text)
    
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the ``stopwords`` list.

    Args:
        text: Iterable of tokens (despite the name, not a raw string).

    Returns:
        A new list with the surviving tokens in their original order.
    """
    # One set per call gives constant-time membership tests, instead of
    # scanning the whole stop-word list once for every token.
    stopword_set = set(stopwords)
    return [token for token in text if token not in stopword_set]

def lemmatizer(text):
    """Return a list with ``wordnet_lemmatizer.lemmatize`` applied to each token of ``text``."""
    return list(map(wordnet_lemmatizer.lemmatize, text))

def processDataframe(df):
    """Build the lemmatized-token column for a frame of raw documents.

    Every entry of ``df['Text']`` goes through, in order: punctuation
    removal, lower-casing, NLTK tokenization, stop-word removal and
    lemmatization.

    Args:
        df: DataFrame with a ``'Text'`` column of strings. It is left
            unmodified; no helper columns are added to it.

    Returns:
        A new DataFrame with the columns of ``df`` except ``'Text'`` (and
        except any column carrying one of the intermediate-stage names
        listed in ``stage_columns``), plus ``'Post-processed Text'`` holding
        the list of tokens for each row.

    Raises:
        KeyError: if ``df`` has no ``'Text'`` column.
    """
    # Chain the stages on the Series itself so the caller's frame is never
    # touched and no throw-away columns are materialised.
    post_processed = (
        df['Text']
        .apply(remove_punctuation)
        .apply(str.lower)
        .apply(nltk_tokenization)
        .apply(remove_stopwords)
        .apply(lemmatizer)
    )
    # The intermediate names are dropped if present so the returned columns
    # are the same even when the input frame already carries them.
    stage_columns = ["Text", "clean_msg", "msg_lower", "tokenized", "nltk_tokenized", "no_stopwords"]
    result = df.drop(columns=stage_columns, errors="ignore")
    result['Post-processed Text'] = post_processed
    return result

In [None]:
# Wrap the loaded [category, text] records in DataFrames, one per split.
trainDF, testDF = [
    pd.DataFrame(records, columns=['Category', 'Text'])
    for records in (rawtrain, rawtest)
]

In [None]:
# Stand-alone TF-IDF vectorizer: fitted on the training text, then reused
# unchanged to encode the test text.
vectorizer = TfidfVectorizer()
tfid_vect_train_vectors = vectorizer.fit_transform(trainDF['Text'])
tfid_vect_test_vectors = vectorizer.transform(testDF['Text'])

# Two pipelines, each pairing its own TF-IDF step with one classifier.
mNB_pipeline = Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', MultinomialNB())])
lr_pipeline = Pipeline(steps=[('vect', TfidfVectorizer()), ('lr', LogisticRegression())])

# Last expression of the cell, so the notebook displays the fitted
# vectorizer's feature names.
vectorizer.get_feature_names_out()