In [1]:
from pathlib import Path

import pandas as pd
from sklearn import linear_model
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from utilities import printConfusionMatrix

Question 1 (5 points) 

Based on the analysis of the document, create a term-document matrix and a concept matrix. Limit the number of concepts to 20.

Examine the term-document matrix 

Is it sparse or dense?

Answer: The term-document matrix is sparse: only about 0.3% of its entries are non-zero.

Look at the first row of the term-document matrix and determine the meaning of the non-zero elements.

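Answer: The first row corresponds to the term 'aa'. A non-zero element in column j means that ad (document) j contains the term 'aa'; the value is the number of times the term occurs in that ad.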

In [2]:
# Load the farm ads: the CSV has no header row; column 0 is the label and
# column 1 is the ad text.
DATA = Path('.').resolve().parent/'data'
farmads_df = pd.read_csv(DATA/'farm-ads.csv', header=None, names=['label', 'text'])
# Recode labels equal to -1 as 0.
farmads_df.loc[farmads_df.label == -1, 'label'] = 0
label = list(farmads_df.label)
corpus = list(farmads_df.text)

# Count term occurrences; the result is a sparse documents x terms matrix.
preprocessor = CountVectorizer()
preprocessedText = preprocessor.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on releases that predate it.
if hasattr(preprocessor, 'get_feature_names_out'):
    terms = preprocessor.get_feature_names_out()
else:
    terms = preprocessor.get_feature_names()

# Transpose to terms x documents for inspection. toarray() densifies the matrix,
# which is memory-hungry; the sparse `preprocessedText` is what feeds the pipeline below.
termDocumentMatrix = pd.DataFrame(data=preprocessedText.toarray().transpose(),
                                  index=terms)
print('Term-document matrix: {0[1]} terms, {0[0]} documents'.format(preprocessedText.shape))
# NOTE: despite the label, the figure printed here is the share of non-zero cells
# (the density), so a small percentage means a very sparse matrix.
print(' sparsity: {:.3f}%\n'.format(100 * preprocessedText.count_nonzero() / (preprocessedText.shape[0] * preprocessedText.shape[1])))

# Re-weight the raw counts with TF-IDF.
tfidfTransformer = TfidfTransformer()
tfidf = tfidfTransformer.fit_transform(preprocessedText)

# Extract 20 concepts using LSA (truncated SVD followed by normalization).
# TruncatedSVD uses a randomized solver by default, so pin the seed to make the
# concept matrix, and everything trained on it, reproducible.
svd = TruncatedSVD(n_components=20, random_state=12345)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
lsa_tfidf = lsa.fit_transform(tfidf)

# Transpose to concepts x documents to mirror the term-document layout.
conceptMatrix = pd.DataFrame(data=lsa_tfidf.transpose())
print(termDocumentMatrix.iloc[:1, :])
print(conceptMatrix.head())

Term-document matrix: 47513 terms, 4143 documents
 sparsity: 0.307%

    0     1     2     3     4     5     6     7     8     9     ...   4133  \
aa     0     0     0     0     0     0     0     0     0     0  ...      0   

    4134  4135  4136  4137  4138  4139  4140  4141  4142  
aa     0     0     0     0     0     0     0     0     0  

[1 rows x 4143 columns]
       0         1         2         3         4         5         6     \
0  0.912599  0.907055  0.893060  0.807469  0.857183  0.749431  0.898271   
1 -0.199553 -0.129813 -0.061585 -0.052199 -0.026312  0.058710 -0.219140   
2  0.246859 -0.003975 -0.160853 -0.310317 -0.226288 -0.500336  0.177987   
3 -0.052602  0.124772  0.046488  0.247595  0.030087  0.101156  0.094909   
4 -0.085492 -0.044585  0.014246 -0.134030 -0.166928 -0.120797 -0.048167   

       7         8         9       ...         4133      4134      4135  \
0  0.632276  0.919159  0.903463    ...     0.854086  0.853765  0.621795   
1  0.182430 -0.130588 -0.10172

Question 2 (4 points)
Briefly explain the difference between the term-document matrix and the concept-document matrix.

Answer: Both matrices have one column per document, but their rows differ: the term-document matrix has one row per term, whereas the concept-document matrix has one row per concept.
The number of terms in the term-document matrix is excessive for effective
model-building, so the preprocessing steps include vocabulary reduction. The concept-document matrix keeps only a limited set
of concepts (20 here) that represents most of the variation in the documents.
With this reduction done, the concept-document matrix is ready to be used
for classifying documents with classification methods.


Question 3 (8 points)
Using logistic regression, partition the data (60% training, 40% validation), and develop a model to classify the documents as ‘relevant’ or ‘non-relevant.’ Comment on its efficacy. 



In [3]:
# Partition the concept features into 60% training and 40% validation.
# The fixed random_state keeps the partition reproducible.
# NOTE(review): the TF-IDF weights and SVD concepts were fitted on the full corpus
# before this split, so the validation accuracy may be slightly optimistic.
Xtrain, Xtest, ytrain, ytest = train_test_split(lsa_tfidf, label, test_size=0.4, random_state=12345)

# Fit a logistic regression model on the training partition.
logit_reg = linear_model.LogisticRegression()
logit_reg.fit(Xtrain, ytrain)

# Print the confusion matrix and accuracy for the validation partition.
printConfusionMatrix(ytest, logit_reg.predict(Xtest))

Confusion Matrix (Accuracy 0.8770)

         Prediction
Reference   0   1
        0 653 124
        1  80 801


Question 4 (3 points)
Why use the concept-document matrix, and not the term-document matrix, to provide the predictor variables?

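Answer: The concept-document matrix condenses the documents into a limited set of concepts (20 here) that represents most of the variation in the documents. The term-document matrix, by contrast, has had no vocabulary reduction: it has tens of thousands of term rows and is extremely sparse, which makes it unwieldy as a source of predictor variables.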