In [1]:
import os
from pathlib import Path
from zipfile import ZipFile

import matplotlib.pylab as plt
import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

import dmba
from dmba import printTermDocumentMatrix, classificationSummary, liftChart


In [2]:
# Location of the farm-ads CSV. The default is the path this notebook was
# originally run with; set the FARM_ADS_CSV environment variable to point at a
# copy elsewhere instead of editing the notebook.
DATA_FILE = Path(os.environ.get("FARM_ADS_CSV", r"C:\Users\oscar\Downloads\farm-ads.csv"))
df = pd.read_csv(DATA_FILE)

# Replace the column labels read_csv produced with the two names used by the
# rest of the notebook (this raises if the file does not have exactly two columns).
df.columns = ['Label', 'Text']

In [3]:
# Recode the class label from {-1, 1} to {0, 1}.
# replace() leaves values that are already 0/1 untouched, so re-running this
# cell is harmless (map() would turn the already-recoded 0s into NaN).
df['Label'] = df['Label'].replace({-1: 0})

# Fail loudly here rather than later inside the classifier if the file
# contains anything other than the two expected classes.
assert df['Label'].isin([0, 1]).all(), "unexpected values in Label column"

In [4]:
# Preview the first five rows of the labelled ad text
df.head(n=5)

Unnamed: 0,Label,Text
0,0,ad-abdominal ad-aortic ad-aneurysm ad-doctorf...
1,0,ad-abdominal ad-aortic ad-aneurysm ad-million...
2,0,ad-absorbent ad-oil ad-snar ad-factory ad-dir...
3,0,ad-acid ad-reflux ad-relief ad-top ad-treatme...
4,0,ad-acid ad-reflux ad-symptom ad-acid ad-reflu...


In [5]:
# The three stages of the concept-document pipeline, set up before use:
#   1. raw term counts, 2. TF-IDF re-weighting, 3. truncated SVD to 20 concepts
count_vect = CountVectorizer()
transformer = TfidfTransformer()
svd_model = TruncatedSVD(random_state=42, n_components=20)

# Learn the vocabulary from the ad text and count term occurrences per document
counts = count_vect.fit_transform(df['Text'])

# Re-weight the raw counts with TF-IDF
X_tfidf = transformer.fit_transform(counts)

# Reduce the TF-IDF matrix to the concept-document matrix
X_concept = svd_model.fit_transform(X_tfidf)


In [6]:
# Term-document matrix as a labelled DataFrame: one row per document, one
# column per vocabulary term, in the column order CountVectorizer assigned.
# NOTE: toarray() materialises the whole sparse count matrix in memory.
dense_counts = counts.toarray()
terms = count_vect.get_feature_names_out()
tdm_df = pd.DataFrame(dense_counts, columns=terms)

# Corpus-wide frequency of every term, keeping the ten most frequent
# (raise the head() argument to see more terms)
most_frequent_terms = tdm_df.sum(axis=0).sort_values(ascending=False).head(10)

print("Most frequently appearing terms in the corpus:")
print(most_frequent_terms)

# The two terms examined in the write-up that follows this cell
terms_of_interest = ['pet', 'health']

# Per-document counts for each chosen term
for term in terms_of_interest:
    print(f"Entries for '{term}':\n{tdm_df[term]}")

# Corpus-wide total for each chosen term
for term in terms_of_interest:
    print(f"Total count for '{term}': {tdm_df[term].sum()}")

Most frequently appearing terms in the corpus:
ad         45437
title      23542
header     14509
list       11228
product    11154
com        10704
pet         7057
health      6202
home        5870
free        5854
dtype: int64
Entries for 'pet':
0        0
1        0
2        0
3        0
4        0
        ..
4138    53
4139    53
4140    53
4141    49
4142    53
Name: pet, Length: 4143, dtype: int64
Entries for 'health':
0        0
1        0
2        0
3        0
4        1
        ..
4138    10
4139     9
4140     9
4141     9
4142     9
Name: health, Length: 4143, dtype: int64
Total count for 'pet': 7057
Total count for 'health': 6202


The two terms chosen are 'pet' and 'health'.<br>
The first entry chosen is for 'pet': the value 53 in row 4138 indicates that the term 'pet' appears 53 times in the document at index 4138. The second entry chosen is for 'health': the value 10 in row 4138 indicates that the term 'health' appears 10 times in that same document. This may show that pet health is the main focus of ad 4138.<br>
Looking at the frequency table, 'pet' and 'health' are both among the 10 most frequent terms in the dataset, which suggests that the ads in the dataset are mainly related to pets and health.<br>

In [7]:
class LemmaTokenizer:
    """Callable tokenizer for CountVectorizer: tokenize, filter, then stem.

    Despite the name, tokens are stemmed with the Snowball English stemmer
    rather than lemmatized.
    """

    def __init__(self):
        self.stemmer = EnglishStemmer()
        # Copied into a plain set for membership tests in __call__
        self.stopWords = set(ENGLISH_STOP_WORDS)

    def __call__(self, doc):
        """Return the stemmed tokens of ``doc``.

        Tokens that are not purely alphabetic, or that appear in the
        stop-word set, are dropped before stemming.
        """
        return [self.stemmer.stem(t) for t in word_tokenize(doc)
                if t.isalpha() and t not in self.stopWords]

# token_pattern=None: the default regex is never used once a custom tokenizer
# is supplied, and leaving it set makes scikit-learn emit a warning on fit.
preprocessor = CountVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None, encoding='latin1')

# Stemmed, stop-word-filtered term counts for every ad
preprocessedText = preprocessor.fit_transform(df['Text'])



In [8]:
# Step 3: TF-IDF and latent semantic analysis
tfidfTransformer = TfidfTransformer()
tfidf = tfidfTransformer.fit_transform(preprocessedText)

# Extract 20 concepts using LSA. TruncatedSVD's default solver is randomized,
# so the seed is fixed to keep the concepts (and the accuracy reported further
# down) reproducible across kernel restarts.
svd = TruncatedSVD(n_components=20, random_state=42)
# Rescale each document's concept vector to unit length after the SVD
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

lsa_tfidf = lsa.fit_transform(tfidf)

In [9]:
label = df['Label']

# Hold out 25% of the documents for testing; the remaining 75% are used for training.
# NOTE(review): lsa_tfidf was fitted on the full corpus before this split, so
# the held-out rows already influenced the TF-IDF/LSA features.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    lsa_tfidf, label, random_state=42, test_size=0.25
)

# Fit a logistic regression classifier on the training portion
logit_reg = LogisticRegression(solver='lbfgs').fit(Xtrain, ytrain)

# Print the confusion matrix and accuracy on the held-out portion
classificationSummary(ytest, logit_reg.predict(Xtest))

Confusion Matrix (Accuracy 0.7992)

       Prediction
Actual   0   1
     0 346 158
     1  50 482


In [11]:
# Evaluate a classifier on X_concept, the concept-document matrix built from
# the plain CountVectorizer counts; 'label' holds the target labels.
# NOTE: this rebinds logit_reg, replacing the model fitted on lsa_tfidf.

# Split dataset into 75% training and 25% test set
X_train, X_test, y_train, y_test = train_test_split(X_concept, label, test_size=0.25, random_state=42)

# max_iter raised above the default to give lbfgs room to converge if needed
logit_reg = LogisticRegression(solver='lbfgs', max_iter=1000)

# 5-fold cross-validation on the training portion only
cv_scores = cross_val_score(logit_reg, X_train, y_train, cv=5)

# Fit on the whole training portion, then score the held-out test set
logit_reg.fit(X_train, y_train)
y_pred = logit_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy score and the cross-validation scores
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean CV accuracy: {cv_scores.mean()}')
print(f'Accuracy Score on the test set: {accuracy}')

Cross-validation scores: [0.85530547 0.88906752 0.87439614 0.88083736 0.86634461]
Mean CV accuracy: 0.8731902180385335
Accuracy Score on the test set: 0.859073359073359
