In [1]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import numpy as np

## Load Data

In [2]:
path = 'scale_data/scaledata'
# os.listdir returns entries in arbitrary order. Sort them so the order in which
# reviews/ratings are concatenated (and therefore the seeded splits further down)
# is reproducible, and keep only directories so that stray files in the data
# folder are not mistaken for a reviewer sub-folder.
list_dir = sorted(
    entry for entry in os.listdir(path)
    if os.path.isdir(os.path.join(path, entry))
)
list_dir

['Dennis+Schwartz', 'James+Berardinelli', 'Scott+Renshaw', 'Steve+Rhodes']

In [3]:
reviews_list = list() ## A list of all reviews
for direc in list_dir:
    # Each reviewer folder holds a "subj.<reviewer>" file; every line of it is
    # appended as one entry (presumably one review per line -- verify against the data).
    file = os.path.join(path, direc, "subj." + direc)
    # The context manager closes the file even if read() raises.
    with open(file) as reviews_file:
        reviews_file_contents = reviews_file.read()
    reviews_list.extend(reviews_file_contents.splitlines())

In [4]:
ratings_list = list() ## A list of all ratings
for direc in list_dir:
    # Each reviewer folder holds a "rating.<reviewer>" file; every line of it is
    # appended as one entry. Entries are kept as strings exactly as read.
    file = os.path.join(path, direc, "rating." + direc)
    # The context manager closes the file even if read() raises.
    with open(file) as ratings_file:
        ratings_file_contents = ratings_file.read()
    ratings_list.extend(ratings_file_contents.splitlines())

## Train-Test Split

In [5]:
# Hold out 30% of all reviews/ratings as the test set (seeded split).
(full_reviews_list_train,
 full_reviews_list_test,
 full_ratings_list_train,
 ratings_list_test) = train_test_split(
    reviews_list,
    ratings_list,
    test_size=0.30,
    random_state=42,
)

## Train-Validation Split

In [6]:
# Carve a validation set (25%) out of the full training portion (seeded split).
(reviews_list_train,
 reviews_list_val,
 ratings_list_train,
 ratings_list_val) = train_test_split(
    full_reviews_list_train,
    full_ratings_list_train,
    test_size=0.25,
    random_state=1,
)

## Transform Data

In [7]:
def tfidf_train(newsgroups_train, n_features, min_df=5, max_df=0.70):
    """
    Fit a TF-IDF vectorizer on the training documents and return their TF-IDF matrix.

    Tokens are runs of ASCII letters; NLTK's English stop words are removed.
    The NLTK 'stopwords' corpus must be available locally (see the
    commented-out nltk.download call next to the imports).

    Args:
        newsgroups_train (iterable of str): training documents, one string per document
        n_features (int): maximum vocabulary size (passed as max_features)
        min_df (int or float): passed to TfidfVectorizer as min_df (default 5)
        max_df (int or float): passed to TfidfVectorizer as max_df (default 0.70)
    Returns:
        vectorizer_train (TfidfVectorizer): the fitted vectorizer
        feature_names_train (list): vocabulary terms, in the row order of X_train
        X_train (ndarray): TF-IDF matrix transposed to shape (terms, documents)

    """
    # Extract Tfidf weights
    stop_words_list = nltk.corpus.stopwords.words('english')
    vectorizer_train = TfidfVectorizer(max_features=n_features,
                                       min_df=min_df, max_df=max_df,
                                       token_pattern=r'[a-zA-Z]+',
                                       stop_words=stop_words_list)
    vectors_train = vectorizer_train.fit_transform(newsgroups_train)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Prefer its replacement when present, and fall back for older versions.
    if hasattr(vectorizer_train, 'get_feature_names_out'):
        feature_names_train = vectorizer_train.get_feature_names_out().tolist()
    else:
        feature_names_train = vectorizer_train.get_feature_names()

    # Sparse (documents, terms) -> dense (terms, documents); copy() makes the
    # transposed result an independent C-ordered array.
    X_train = vectors_train.toarray().T.copy()

    return vectorizer_train, feature_names_train, X_train

In [8]:
def tfidf_transform(vectorizer_train, newsgroups_test):
    """
    Project unseen documents into the TF-IDF space of an already fitted vectorizer.

    Args:
        vectorizer_train (object): fitted tfidf vectorizer (only its transform() is used)
        newsgroups_test (iterable of str): documents to transform
    Returns:
        X_test (ndarray): dense tfidf matrix, transposed so that rows are
            vocabulary terms and columns are documents
    """
    # Sparse (documents, terms) result of the vectorizer, densified.
    doc_term = vectorizer_train.transform(newsgroups_test).toarray()

    # Flip to (terms, documents) and return an independent copy.
    return doc_term.T.copy()

In [9]:
def shuffle_data(X, y, random_state=None):
    """
    Shuffle the columns of X and the entries of y with the same random permutation.

    X and y are permuted through a shared index array instead of being stacked
    into one array, so each keeps its own dtype (stacking a float matrix with
    string labels would turn every value into a string). The inputs are not
    modified; new arrays are returned.

    Args:
        X (ndarray): data matrix, shape (vocabulary, documents)
        y (ndarray): labels, shape (documents,); a (1, documents) array is also accepted
        random_state (int or None): seed for a reproducible permutation; when None
            the global numpy random state is used
    Returns:
        X (ndarray): column-shuffled data matrix, shape (vocabulary, documents)
        y (ndarray): labels in the matching order, shape (documents,)
    Raises:
        ValueError: if the number of labels differs from the number of columns of X
    """
    X = np.atleast_2d(X)
    y = np.asarray(y)
    # Tolerate labels already expanded to a single row.
    if y.ndim == 2 and y.shape[0] == 1:
        y = y[0]
    if y.ndim != 1 or y.shape[0] != X.shape[1]:
        raise ValueError(
            "y must contain one label per column of X: got X with shape %s and y with shape %s"
            % (X.shape, y.shape)
        )

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    order = rng.permutation(X.shape[1])

    return X[:, order], y[order]

In [10]:
# Fit the TF-IDF vectorizer on the training reviews (vocabulary capped at 5000 terms)
vectorizer_train, feature_names_train, X_train = tfidf_train(
    reviews_list_train,
    n_features=5000,
)

In [11]:
# Ratings as an array, then shuffled together with the matching columns of X_train
train_ratings = np.asarray(ratings_list_train)
X_train, train_ratings = shuffle_data(X=X_train, y=train_ratings)

In [12]:
# Validation set: reuse the vectorizer fitted on the training reviews
X_val = tfidf_transform(
    vectorizer_train=vectorizer_train,
    newsgroups_test=reviews_list_val,
)
val_ratings = np.asarray(ratings_list_val)
X_val, val_ratings = shuffle_data(X=X_val, y=val_ratings)

In [13]:
# Compute the TFIDF representation of the full train set
vectorizer_train_full, feature_names_train_full, X_train_full = tfidf_train(full_reviews_list_train, n_features = 5000)
full_train_ratings = np.array(full_ratings_list_train)
# Shuffle the TF-IDF matrix with its ratings. The raw review strings
# (full_reviews_list_train) must not be passed here, otherwise the matrix
# computed above is discarded and X_train_full ends up holding text.
X_train_full, full_train_ratings = shuffle_data(X_train_full, full_train_ratings)

In [14]:
# Test set: reuse the vectorizer fitted on the full training reviews
X_test = tfidf_transform(
    vectorizer_train=vectorizer_train_full,
    newsgroups_test=full_reviews_list_test,
)
test_ratings = np.asarray(ratings_list_test)
X_test, test_ratings = shuffle_data(X=X_test, y=test_ratings)

In [15]:
## Convert the 1-D rating arrays to 2-D row vectors of shape (1, documents) to use in SSNMF.
## reshape(1, -1) is used instead of [np.newaxis, :] so that re-running this cell
## leaves the arrays 2-D rather than adding one more axis each time.
train_ratings = train_ratings.reshape(1, -1)
val_ratings = val_ratings.reshape(1, -1)
full_train_ratings = full_train_ratings.reshape(1, -1)
test_ratings = test_ratings.reshape(1, -1)

In [16]:
# Sanity check: expected layout is (vocabulary terms, training documents)
X_train.shape

(5000, 2628)

In [17]:
# Sanity check: expected layout is (1, training documents) after the 2-D conversion
train_ratings.shape

(1, 2628)