In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import Datasets

In [2]:
def loadCSV(filename):
    """Read a CSV file into a DataFrame, appending '.csv' if it is missing.

    Parameters
    ----------
    filename : str or os.PathLike
        Path to the file, with or without the '.csv' extension.

    Returns
    -------
    pandas.DataFrame
        Contents of the file decoded as ISO-8859-1.
    """
    file = os.fspath(filename)
    # Test the suffix, not mere containment: a path such as
    # 'exports.csv.d/processed_1' contains '.csv' but still lacks the extension.
    if not file.lower().endswith('.csv'):
        file += '.csv'
    return pd.read_csv(file, encoding = 'ISO-8859-1')

In [3]:
# Load both pre-processed tweet datasets (loadCSV appends the '.csv' extension).
d1, d2 = (loadCSV(stem) for stem in ('processed_1', 'processed_2'))

# 4 Scenarios

### 1) Train and test classifiers on data from only dataset 1
### 2) Train and test classifiers on data from only dataset 2
### 3) Train classifiers on data from dataset 1 and test them on dataset 2
### 4) Train classifiers on data from dataset 2 and test them on dataset 1

# Split Tweets into Training and Testing Set

In [4]:
# Scenario 1: stratified 80/20 split of dataset 1 (fixed seed for repeatability).
X1_train, X1_test, y1_train, y1_test = train_test_split(
    d1['tweet'].tolist(),
    d1['class'].tolist(),
    stratify = d1['class'].tolist(),
    shuffle = True,
    random_state = 42,
    test_size = 0.2,
)

In [5]:
# Scenario 2: stratified 80/20 split of dataset 2 (fixed seed for repeatability).
X2_train, X2_test, y2_train, y2_test = train_test_split(
    d2['tweet'].tolist(),
    d2['class'].tolist(),
    stratify = d2['class'].tolist(),
    shuffle = True,
    random_state = 42,
    test_size = 0.2,
)

In [6]:
# Scenario 3: train on all of dataset 1, test on all of dataset 2.
X3_train, y3_train = d1['tweet'].tolist(), d1['class'].tolist()
X3_test, y3_test = d2['tweet'].tolist(), d2['class'].tolist()

In [7]:
# Scenario 4: train on all of dataset 2, test on all of dataset 1.
X4_train, y4_train = d2['tweet'].tolist(), d2['class'].tolist()
X4_test, y4_test = d1['tweet'].tolist(), d1['class'].tolist()

# Define NLP Functions

In [8]:
def makeUnigram(train, test, total_features):
    """Build unigram count features for a train/test pair of documents.

    The vocabulary is learned from `train` only, capped at `total_features`
    terms, and `test` is encoded against that same vocabulary so both matrices
    share identical columns.

    Parameters
    ----------
    train, test : raw documents (the notebook passes lists of tweet strings)
    total_features : value forwarded to CountVectorizer(max_features=...)

    Returns
    -------
    tuple of two dense arrays: (train counts, test counts).
    """
    unigram = CountVectorizer(max_features = total_features)
    train_matrix = unigram.fit_transform(train)

    # Reuse the fitted vectorizer instead of building a second one from
    # get_feature_names(), which was removed in scikit-learn 1.2. With a fixed
    # vocabulary the resulting counts are the same.
    test_matrix = unigram.transform(test)

    return train_matrix.toarray(), test_matrix.toarray()

In [9]:
def makeUnigramTfidf(train, test, total_features):
    """Build unigram TF-IDF features for a train/test pair of documents.

    Both the vocabulary (capped at `total_features` terms) and the IDF weights
    are learned from `train` only; `test` is transformed with those fitted
    statistics.

    Parameters
    ----------
    train, test : raw documents (the notebook passes lists of tweet strings)
    total_features : value forwarded to TfidfVectorizer(max_features=...)

    Returns
    -------
    tuple of two dense arrays: (train TF-IDF, test TF-IDF).
    """
    tfidf = TfidfVectorizer(max_features = total_features)
    train_matrix = tfidf.fit_transform(train)

    # transform(), not a fresh fit_transform(): refitting on the test set would
    # recompute IDF from test documents, so train and test columns would be
    # scaled inconsistently and test statistics would leak into the features.
    test_matrix = tfidf.transform(test)

    return train_matrix.toarray(), test_matrix.toarray()

In [10]:
def makeBigram(train, test, total_features):
    """Build bigram count features for a train/test pair of documents.

    The bigram vocabulary is learned from `train` only, capped at
    `total_features` terms, and `test` is encoded against that same vocabulary.

    Parameters
    ----------
    train, test : raw documents (the notebook passes lists of tweet strings)
    total_features : value forwarded to CountVectorizer(max_features=...)

    Returns
    -------
    tuple of two dense arrays: (train counts, test counts).
    """
    bigram = CountVectorizer(ngram_range = (2,2), max_features = total_features)
    train_matrix = bigram.fit_transform(train)

    # Reuse the fitted vectorizer instead of building a second one from
    # get_feature_names(), which was removed in scikit-learn 1.2. With a fixed
    # vocabulary the resulting counts are the same.
    test_matrix = bigram.transform(test)

    return train_matrix.toarray(), test_matrix.toarray()

In [11]:
def makeBigramTfidf(train, test, total_features):
    """Build bigram TF-IDF features for a train/test pair of documents.

    Both the bigram vocabulary (capped at `total_features` terms) and the IDF
    weights are learned from `train` only; `test` is transformed with those
    fitted statistics.

    Parameters
    ----------
    train, test : raw documents (the notebook passes lists of tweet strings)
    total_features : value forwarded to TfidfVectorizer(max_features=...)

    Returns
    -------
    tuple of two dense arrays: (train TF-IDF, test TF-IDF).
    """
    tfidf = TfidfVectorizer(ngram_range = (2,2), max_features = total_features)
    train_matrix = tfidf.fit_transform(train)

    # transform(), not a fresh fit_transform(): refitting on the test set would
    # recompute IDF from test documents, so train and test columns would be
    # scaled inconsistently and test statistics would leak into the features.
    test_matrix = tfidf.transform(test)

    return train_matrix.toarray(), test_matrix.toarray()

In [12]:
def reduceDim(train, test, random_state = 42):
    """Project train/test feature matrices onto a shared truncated-SVD basis.

    The number of components is 1/20 of the number of columns in `train`
    (at least 1). The decomposition is fitted on `train` only and then applied
    to `test`, so both outputs live in the same reduced space.

    Parameters
    ----------
    train, test : 2-D feature matrices with the same number of columns
    random_state : seed forwarded to TruncatedSVD so repeated runs agree
        (default 42, the same seed the notebook uses for its splits)

    Returns
    -------
    tuple of two arrays: (reduced train, reduced test).
    """
    # Floor division matches the previous int(len/20); the max() guards against
    # asking for zero components when there are fewer than 20 columns.
    n = max(1, len(train[0]) // 20)
    tsvd = TruncatedSVD(n_components = n, random_state = random_state)

    rtrain = tsvd.fit_transform(train)
    # transform(), not fit_transform(): refitting on the test matrix would
    # produce a different basis, making test features incomparable with train.
    rtest = tsvd.transform(test)

    return rtrain, rtest

### Features from Unigram/TF-IDF Unigram:

Scenario 1) 11671 reduced to 9500

Scenario 2) 15000 reduced to 12000

Scenario 3) 13380 reduced to 11000

Scenario 4) 17704 reduced to 14000

### Features from Bigram/TF-IDF Bigram:

Scenario 1) 54205 reduced to 43000

Scenario 2) 88523 reduced to 71000

Scenario 3) 66368 reduced to 53000

Scenario 4) 107099 reduced to 86000

### Dimensionally Reduced Unigram/TF-IDF Unigram:

Scenario 1) 9500 reduced to 475

Scenario 2) 12000 reduced to 600

Scenario 3) 11000 reduced to 550

Scenario 4) 14000 reduced to 700

### Dimensionally Reduced Bigram/TF-IDF Bigram:

Scenario 1) 43000 reduced to 2150

Scenario 2) 71000 reduced to 3550

Scenario 3) 53000 reduced to 2650

Scenario 4) 86000 reduced to 4300

# Create Save Directories

In [13]:
# exist_ok makes the cell safe to re-run (os.mkdir raises if 'nlp' already exists).
os.makedirs('nlp', exist_ok = True)

In [14]:
# One output folder per scenario. makedirs + exist_ok creates missing parents
# and tolerates re-running the cell.
for scenario_id in (1, 2, 3, 4):
    os.makedirs('nlp/scenario_%s' % scenario_id, exist_ok = True)

In [15]:
# Label folder for every scenario. makedirs + exist_ok creates missing parents
# and tolerates re-running the cell.
for scenario_id in (1, 2, 3, 4):
    os.makedirs('nlp/scenario_%s/labels' % scenario_id, exist_ok = True)

In [16]:
# Feature folders for scenario 1. makedirs + exist_ok creates missing parents
# and tolerates re-running the cell.
for feature_dir in ('unigram', 'unigram_tfidf', 'bigram', 'bigram_tfidf',
                    'reduced_unigram', 'reduced_unigram_tfidf',
                    'reduced_bigram', 'reduced_bigram_tfidf'):
    os.makedirs('nlp/scenario_1/%s' % feature_dir, exist_ok = True)

In [None]:
# Feature folders for scenario 2. makedirs + exist_ok creates missing parents
# and tolerates re-running the cell.
for feature_dir in ('unigram', 'unigram_tfidf', 'bigram', 'bigram_tfidf',
                    'reduced_unigram', 'reduced_unigram_tfidf',
                    'reduced_bigram', 'reduced_bigram_tfidf'):
    os.makedirs('nlp/scenario_2/%s' % feature_dir, exist_ok = True)

In [None]:
# Feature folders for scenario 3. makedirs + exist_ok creates missing parents
# and tolerates re-running the cell.
for feature_dir in ('unigram', 'unigram_tfidf', 'bigram', 'bigram_tfidf',
                    'reduced_unigram', 'reduced_unigram_tfidf',
                    'reduced_bigram', 'reduced_bigram_tfidf'):
    os.makedirs('nlp/scenario_3/%s' % feature_dir, exist_ok = True)

In [None]:
# Feature folders for scenario 4. makedirs + exist_ok creates missing parents
# and tolerates re-running the cell.
for feature_dir in ('unigram', 'unigram_tfidf', 'bigram', 'bigram_tfidf',
                    'reduced_unigram', 'reduced_unigram_tfidf',
                    'reduced_bigram', 'reduced_bigram_tfidf'):
    os.makedirs('nlp/scenario_4/%s' % feature_dir, exist_ok = True)

In [17]:
def saveFile(scenario, nlp, name, data):
    """Save `data` as nlp/scenario_<scenario>/<nlp>/<name>.npy.

    Parameters
    ----------
    scenario : scenario identifier interpolated into the folder name
    nlp : sub-folder name (e.g. 'labels', 'unigram')
    name : file stem (the notebook uses 'train' and 'test')
    data : array-like object handed to numpy.save
    """
    filepath = 'nlp/scenario_%s/%s/%s.npy' % (scenario, nlp, name)
    # Create the target folder on demand so saving does not depend on the
    # directory-creation cells having been executed first.
    os.makedirs(os.path.dirname(filepath), exist_ok = True)
    np.save(filepath, data)

# Save Labels

In [18]:
# Scenario 1 labels (dataset 1 train/test split).
saveFile(scenario = 1, nlp = 'labels', name = 'train', data = y1_train)
saveFile(scenario = 1, nlp = 'labels', name = 'test', data = y1_test)

In [None]:
# Scenario 2 labels (dataset 2 train/test split).
saveFile(scenario = 2, nlp = 'labels', name = 'train', data = y2_train)
saveFile(scenario = 2, nlp = 'labels', name = 'test', data = y2_test)

In [None]:
# Scenario 3 labels (train = dataset 1, test = dataset 2).
saveFile(scenario = 3, nlp = 'labels', name = 'train', data = y3_train)
saveFile(scenario = 3, nlp = 'labels', name = 'test', data = y3_test)

In [None]:
# Scenario 4 labels (train = dataset 2, test = dataset 1).
saveFile(scenario = 4, nlp = 'labels', name = 'train', data = y4_train)
saveFile(scenario = 4, nlp = 'labels', name = 'test', data = y4_test)

# Create and Save Unigrams

In [19]:
# Scenario 1 unigram counts; vocabulary capped at 9500 terms.
train1_uni, test1_uni = makeUnigram(train = X1_train, test = X1_test, total_features = 9500)

In [None]:
# Scenario 2 unigram counts; vocabulary capped at 12000 terms.
train2_uni, test2_uni = makeUnigram(train = X2_train, test = X2_test, total_features = 12000)

In [None]:
# Scenario 3 unigram counts; vocabulary capped at 11000 terms.
train3_uni, test3_uni = makeUnigram(train = X3_train, test = X3_test, total_features = 11000)

In [None]:
# Scenario 4 unigram counts; vocabulary capped at 14000 terms.
train4_uni, test4_uni = makeUnigram(train = X4_train, test = X4_test, total_features = 14000)

In [20]:
# Persist scenario 1 unigram count matrices.
saveFile(scenario = 1, nlp = 'unigram', name = 'train', data = train1_uni)
saveFile(scenario = 1, nlp = 'unigram', name = 'test', data = test1_uni)

In [None]:
# Persist scenario 2 unigram count matrices.
saveFile(scenario = 2, nlp = 'unigram', name = 'train', data = train2_uni)
saveFile(scenario = 2, nlp = 'unigram', name = 'test', data = test2_uni)

In [None]:
# Persist scenario 3 unigram count matrices.
saveFile(scenario = 3, nlp = 'unigram', name = 'train', data = train3_uni)
saveFile(scenario = 3, nlp = 'unigram', name = 'test', data = test3_uni)

In [None]:
# Persist scenario 4 unigram count matrices.
saveFile(scenario = 4, nlp = 'unigram', name = 'train', data = train4_uni)
saveFile(scenario = 4, nlp = 'unigram', name = 'test', data = test4_uni)

# Create and Save Unigram - Tf-idf

In [21]:
# Scenario 1 TF-IDF unigram features; vocabulary capped at 9500 terms.
train1_tfu, test1_tfu = makeUnigramTfidf(train = X1_train, test = X1_test, total_features = 9500)

In [None]:
# Scenario 2 TF-IDF unigram features; vocabulary capped at 12000 terms.
train2_tfu, test2_tfu = makeUnigramTfidf(train = X2_train, test = X2_test, total_features = 12000)

In [None]:
# Scenario 3 TF-IDF unigram features; vocabulary capped at 11000 terms.
train3_tfu, test3_tfu = makeUnigramTfidf(train = X3_train, test = X3_test, total_features = 11000)

In [None]:
# Scenario 4 TF-IDF unigram features; vocabulary capped at 14000 terms.
train4_tfu, test4_tfu = makeUnigramTfidf(train = X4_train, test = X4_test, total_features = 14000)

In [22]:
# Persist scenario 1 TF-IDF unigram matrices.
saveFile(scenario = 1, nlp = 'unigram_tfidf', name = 'train', data = train1_tfu)
saveFile(scenario = 1, nlp = 'unigram_tfidf', name = 'test', data = test1_tfu)

In [None]:
# Persist scenario 2 TF-IDF unigram matrices.
saveFile(scenario = 2, nlp = 'unigram_tfidf', name = 'train', data = train2_tfu)
saveFile(scenario = 2, nlp = 'unigram_tfidf', name = 'test', data = test2_tfu)

In [None]:
# Persist scenario 3 TF-IDF unigram matrices.
saveFile(scenario = 3, nlp = 'unigram_tfidf', name = 'train', data = train3_tfu)
saveFile(scenario = 3, nlp = 'unigram_tfidf', name = 'test', data = test3_tfu)

In [None]:
# Persist scenario 4 TF-IDF unigram matrices.
saveFile(scenario = 4, nlp = 'unigram_tfidf', name = 'train', data = train4_tfu)
saveFile(scenario = 4, nlp = 'unigram_tfidf', name = 'test', data = test4_tfu)

# Create and Save Bigrams

In [23]:
# Scenario 1 bigram counts; vocabulary capped at 43000 terms.
train1_big, test1_big = makeBigram(train = X1_train, test = X1_test, total_features = 43000)

In [None]:
# Scenario 2 bigram counts; vocabulary capped at 71000 terms.
train2_big, test2_big = makeBigram(train = X2_train, test = X2_test, total_features = 71000)

In [None]:
# Scenario 3 bigram counts; vocabulary capped at 53000 terms.
train3_big, test3_big = makeBigram(train = X3_train, test = X3_test, total_features = 53000)

In [None]:
# Scenario 4 bigram counts; vocabulary capped at 86000 terms.
train4_big, test4_big = makeBigram(train = X4_train, test = X4_test, total_features = 86000)

In [24]:
# Persist scenario 1 bigram count matrices.
saveFile(scenario = 1, nlp = 'bigram', name = 'train', data = train1_big)
saveFile(scenario = 1, nlp = 'bigram', name = 'test', data = test1_big)

In [None]:
# Persist scenario 2 bigram count matrices.
saveFile(scenario = 2, nlp = 'bigram', name = 'train', data = train2_big)
saveFile(scenario = 2, nlp = 'bigram', name = 'test', data = test2_big)

In [None]:
# Persist scenario 3 bigram count matrices.
saveFile(scenario = 3, nlp = 'bigram', name = 'train', data = train3_big)
saveFile(scenario = 3, nlp = 'bigram', name = 'test', data = test3_big)

In [None]:
# Persist scenario 4 bigram count matrices.
saveFile(scenario = 4, nlp = 'bigram', name = 'train', data = train4_big)
saveFile(scenario = 4, nlp = 'bigram', name = 'test', data = test4_big)

# Create and Save Bigram - Tfidf

In [25]:
# Scenario 1 TF-IDF bigram features; vocabulary capped at 43000 terms.
train1_tfb, test1_tfb = makeBigramTfidf(train = X1_train, test = X1_test, total_features = 43000)

In [None]:
# Scenario 2 TF-IDF bigram features; vocabulary capped at 71000 terms.
train2_tfb, test2_tfb = makeBigramTfidf(train = X2_train, test = X2_test, total_features = 71000)

In [None]:
# Scenario 3 TF-IDF bigram features; vocabulary capped at 53000 terms.
train3_tfb, test3_tfb = makeBigramTfidf(train = X3_train, test = X3_test, total_features = 53000)

In [None]:
# Scenario 4 TF-IDF bigram features; vocabulary capped at 86000 terms.
train4_tfb, test4_tfb = makeBigramTfidf(train = X4_train, test = X4_test, total_features = 86000)

In [26]:
# Persist scenario 1 TF-IDF bigram matrices.
saveFile(scenario = 1, nlp = 'bigram_tfidf', name = 'train', data = train1_tfb)
saveFile(scenario = 1, nlp = 'bigram_tfidf', name = 'test', data = test1_tfb)

In [None]:
# Persist scenario 2 TF-IDF bigram matrices.
saveFile(scenario = 2, nlp = 'bigram_tfidf', name = 'train', data = train2_tfb)
saveFile(scenario = 2, nlp = 'bigram_tfidf', name = 'test', data = test2_tfb)

In [None]:
# Persist scenario 3 TF-IDF bigram matrices.
saveFile(scenario = 3, nlp = 'bigram_tfidf', name = 'train', data = train3_tfb)
saveFile(scenario = 3, nlp = 'bigram_tfidf', name = 'test', data = test3_tfb)

In [None]:
# Persist scenario 4 TF-IDF bigram matrices.
saveFile(scenario = 4, nlp = 'bigram_tfidf', name = 'train', data = train4_tfb)
saveFile(scenario = 4, nlp = 'bigram_tfidf', name = 'test', data = test4_tfb)

# Dimensionality Reduction: Unigram

In [27]:
# Scenario 1: SVD-reduce the unigram count matrices.
train1_runi, test1_runi = reduceDim(train = train1_uni, test = test1_uni)

In [None]:
# Scenario 2: SVD-reduce the unigram count matrices.
train2_runi, test2_runi = reduceDim(train = train2_uni, test = test2_uni)

In [None]:
# Scenario 3: SVD-reduce the unigram count matrices.
train3_runi, test3_runi = reduceDim(train = train3_uni, test = test3_uni)

In [None]:
# Scenario 4: SVD-reduce the unigram count matrices.
train4_runi, test4_runi = reduceDim(train = train4_uni, test = test4_uni)

In [28]:
# Persist scenario 1 reduced unigram matrices.
saveFile(scenario = 1, nlp = 'reduced_unigram', name = 'train', data = train1_runi)
saveFile(scenario = 1, nlp = 'reduced_unigram', name = 'test', data = test1_runi)

In [None]:
# Persist scenario 2 reduced unigram matrices.
saveFile(scenario = 2, nlp = 'reduced_unigram', name = 'train', data = train2_runi)
saveFile(scenario = 2, nlp = 'reduced_unigram', name = 'test', data = test2_runi)

In [None]:
# Persist scenario 3 reduced unigram matrices.
saveFile(scenario = 3, nlp = 'reduced_unigram', name = 'train', data = train3_runi)
saveFile(scenario = 3, nlp = 'reduced_unigram', name = 'test', data = test3_runi)

In [None]:
# Persist scenario 4 reduced unigram matrices.
saveFile(scenario = 4, nlp = 'reduced_unigram', name = 'train', data = train4_runi)
saveFile(scenario = 4, nlp = 'reduced_unigram', name = 'test', data = test4_runi)

# Dimensionality Reduction: Unigram - Tfidf

In [29]:
# Scenario 1: SVD-reduce the TF-IDF unigram matrices.
train1_rtfu, test1_rtfu = reduceDim(train = train1_tfu, test = test1_tfu)

In [None]:
# Scenario 2: SVD-reduce the TF-IDF unigram matrices.
train2_rtfu, test2_rtfu = reduceDim(train = train2_tfu, test = test2_tfu)

In [None]:
# Scenario 3: SVD-reduce the TF-IDF unigram matrices.
train3_rtfu, test3_rtfu = reduceDim(train = train3_tfu, test = test3_tfu)

In [None]:
# Scenario 4: SVD-reduce the TF-IDF unigram matrices.
train4_rtfu, test4_rtfu = reduceDim(train = train4_tfu, test = test4_tfu)

In [30]:
# Persist scenario 1 reduced TF-IDF unigram matrices.
saveFile(scenario = 1, nlp = 'reduced_unigram_tfidf', name = 'train', data = train1_rtfu)
saveFile(scenario = 1, nlp = 'reduced_unigram_tfidf', name = 'test', data = test1_rtfu)

In [None]:
# Persist scenario 2 reduced TF-IDF unigram matrices.
saveFile(scenario = 2, nlp = 'reduced_unigram_tfidf', name = 'train', data = train2_rtfu)
saveFile(scenario = 2, nlp = 'reduced_unigram_tfidf', name = 'test', data = test2_rtfu)

In [None]:
# Persist scenario 3 reduced TF-IDF unigram matrices.
saveFile(scenario = 3, nlp = 'reduced_unigram_tfidf', name = 'train', data = train3_rtfu)
saveFile(scenario = 3, nlp = 'reduced_unigram_tfidf', name = 'test', data = test3_rtfu)

In [None]:
# Persist scenario 4 reduced TF-IDF unigram matrices.
saveFile(scenario = 4, nlp = 'reduced_unigram_tfidf', name = 'train', data = train4_rtfu)
saveFile(scenario = 4, nlp = 'reduced_unigram_tfidf', name = 'test', data = test4_rtfu)

# Dimensionality Reduction: Bigram

In [31]:
# Scenario 1: SVD-reduce the bigram count matrices.
train1_rbig, test1_rbig = reduceDim(train = train1_big, test = test1_big)

In [None]:
# Scenario 2: SVD-reduce the bigram count matrices.
train2_rbig, test2_rbig = reduceDim(train = train2_big, test = test2_big)

In [None]:
# Scenario 3: SVD-reduce the bigram count matrices.
train3_rbig, test3_rbig = reduceDim(train = train3_big, test = test3_big)

In [None]:
# Scenario 4: SVD-reduce the bigram count matrices.
train4_rbig, test4_rbig = reduceDim(train = train4_big, test = test4_big)

In [33]:
# Persist scenario 1 reduced bigram matrices.
saveFile(scenario = 1, nlp = 'reduced_bigram', name = 'train', data = train1_rbig)
saveFile(scenario = 1, nlp = 'reduced_bigram', name = 'test', data = test1_rbig)

In [None]:
# Persist scenario 2 reduced bigram matrices.
saveFile(scenario = 2, nlp = 'reduced_bigram', name = 'train', data = train2_rbig)
saveFile(scenario = 2, nlp = 'reduced_bigram', name = 'test', data = test2_rbig)

In [None]:
# Persist scenario 3 reduced bigram matrices.
saveFile(scenario = 3, nlp = 'reduced_bigram', name = 'train', data = train3_rbig)
saveFile(scenario = 3, nlp = 'reduced_bigram', name = 'test', data = test3_rbig)

In [None]:
# Persist scenario 4 reduced bigram matrices.
saveFile(scenario = 4, nlp = 'reduced_bigram', name = 'train', data = train4_rbig)
saveFile(scenario = 4, nlp = 'reduced_bigram', name = 'test', data = test4_rbig)

# Dimensionality Reduction: Bigram - Tfidf

In [32]:
# Scenario 1: SVD-reduce the TF-IDF bigram matrices.
train1_rtfb, test1_rtfb = reduceDim(train = train1_tfb, test = test1_tfb)

In [None]:
# Scenario 2: SVD-reduce the TF-IDF bigram matrices.
train2_rtfb, test2_rtfb = reduceDim(train = train2_tfb, test = test2_tfb)

In [None]:
# Scenario 3: SVD-reduce the TF-IDF bigram matrices.
train3_rtfb, test3_rtfb = reduceDim(train = train3_tfb, test = test3_tfb)

In [None]:
# Scenario 4: SVD-reduce the TF-IDF bigram matrices.
train4_rtfb, test4_rtfb = reduceDim(train = train4_tfb, test = test4_tfb)

In [34]:
# Persist scenario 1 reduced TF-IDF bigram matrices.
saveFile(scenario = 1, nlp = 'reduced_bigram_tfidf', name = 'train', data = train1_rtfb)
saveFile(scenario = 1, nlp = 'reduced_bigram_tfidf', name = 'test', data = test1_rtfb)

In [None]:
# Persist scenario 2 reduced TF-IDF bigram matrices.
saveFile(scenario = 2, nlp = 'reduced_bigram_tfidf', name = 'train', data = train2_rtfb)
saveFile(scenario = 2, nlp = 'reduced_bigram_tfidf', name = 'test', data = test2_rtfb)

In [None]:
# Persist scenario 3 reduced TF-IDF bigram matrices.
saveFile(scenario = 3, nlp = 'reduced_bigram_tfidf', name = 'train', data = train3_rtfb)
saveFile(scenario = 3, nlp = 'reduced_bigram_tfidf', name = 'test', data = test3_rtfb)

In [None]:
# Persist scenario 4 reduced TF-IDF bigram matrices.
saveFile(scenario = 4, nlp = 'reduced_bigram_tfidf', name = 'train', data = train4_rtfb)
saveFile(scenario = 4, nlp = 'reduced_bigram_tfidf', name = 'test', data = test4_rtfb)