In [81]:
# Import the necessary libraries and packages

import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [82]:
# Read the dataset

df = pd.read_csv('comcast_consumeraffairs_complaints.csv')

In [83]:
# Check the dataset's general info

df.head()

Unnamed: 0,author,posted_on,rating,text
0,"Alantae of Chesterfeild, MI","Nov. 22, 2016",1,I used to love Comcast. Until all these consta...
1,"Vera of Philadelphia, PA","Nov. 19, 2016",1,I'm so over Comcast! The worst internet provid...
2,"Sarah of Rancho Cordova, CA","Nov. 17, 2016",1,If I could give them a negative star or no sta...
3,"Dennis of Manchester, NH","Nov. 16, 2016",1,I've had the worst experiences so far since in...
4,"Ryan of Bellevue, WA","Nov. 14, 2016",1,Check your contract when you sign up for Comca...


In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5659 entries, 0 to 5658
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   author     5659 non-null   object
 1   posted_on  5659 non-null   object
 2   rating     5659 non-null   int64 
 3   text       5629 non-null   object
dtypes: int64(1), object(3)
memory usage: 177.0+ KB


In [85]:
df.describe()

Unnamed: 0,rating
count,5659.0
mean,0.822053
std,0.669991
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,5.0


In [86]:
# Check if there is missing data

df.isnull().sum()

author        0
posted_on     0
rating        0
text         30
dtype: int64

In [87]:
df['posted_on'] = pd.to_datetime(df['posted_on'], errors='coerce', infer_datetime_format=True) #convert to datetime

  df['posted_on'] = pd.to_datetime(df['posted_on'], errors='coerce', infer_datetime_format=True) #convert to datetime


In [88]:
# Delete the rows with missing data 

df = df.dropna(subset=['posted_on'])

In [89]:
# Delete the complaints prior to 2009

df = df[df['posted_on'].dt.year >= 2009]

In [90]:
# Check the dataset after the process

df.dropna(subset=['text'], inplace=True)
df.isnull().sum()

author       0
posted_on    0
rating       0
text         0
dtype: int64

In [91]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2607 entries, 0 to 5200
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   author     2607 non-null   object        
 1   posted_on  2607 non-null   datetime64[ns]
 2   rating     2607 non-null   int64         
 3   text       2607 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 101.8+ KB


In [92]:
# Tokenize the text using NLTK's word_tokenize() function
df['tokenized_text'] = df['text'].apply(word_tokenize) # Apply it to all rows


In [93]:
# Check the tokenization
print(df['tokenized_text'].head()) 

0    [I, used, to, love, Comcast, ., Until, all, th...
1    [I, 'm, so, over, Comcast, !, The, worst, inte...
2    [If, I, could, give, them, a, negative, star, ...
3    [I, 've, had, the, worst, experiences, so, far...
4    [Check, your, contract, when, you, sign, up, f...
Name: tokenized_text, dtype: object


In [94]:
df.head() # Now we have an additional column but we'll delete it afterwards

Unnamed: 0,author,posted_on,rating,text,tokenized_text
0,"Alantae of Chesterfeild, MI",2016-11-22,1,I used to love Comcast. Until all these consta...,"[I, used, to, love, Comcast, ., Until, all, th..."
1,"Vera of Philadelphia, PA",2016-11-19,1,I'm so over Comcast! The worst internet provid...,"[I, 'm, so, over, Comcast, !, The, worst, inte..."
2,"Sarah of Rancho Cordova, CA",2016-11-17,1,If I could give them a negative star or no sta...,"[If, I, could, give, them, a, negative, star, ..."
3,"Dennis of Manchester, NH",2016-11-16,1,I've had the worst experiences so far since in...,"[I, 've, had, the, worst, experiences, so, far..."
4,"Ryan of Bellevue, WA",2016-11-14,1,Check your contract when you sign up for Comca...,"[Check, your, contract, when, you, sign, up, f..."


In [95]:
# Download NLTK stopwords
nltk.download('stopwords')

# Get the list
stop_words = set(stopwords.words('english'))

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [96]:
# Remove stopwords function
def remove_stopwords(tokens):
    return [token for token in tokens if token.lower() not in stop_words]


In [97]:
# Remove stopwords from the 'tokenized_text' column
df['tokenized_text'] = df['tokenized_text'].apply(remove_stopwords) # Apply to all columns

In [98]:
# Check the removal of stopwords
print(df['tokenized_text'].head())

0    [used, love, Comcast, ., constant, updates, .,...
1    ['m, Comcast, !, worst, internet, provider, .,...
2    [could, give, negative, star, stars, review, w...
3    ['ve, worst, experiences, far, since, install,...
4    [Check, contract, sign, Comcast, advertised, o...
Name: tokenized_text, dtype: object


In [99]:
df.head()

Unnamed: 0,author,posted_on,rating,text,tokenized_text
0,"Alantae of Chesterfeild, MI",2016-11-22,1,I used to love Comcast. Until all these consta...,"[used, love, Comcast, ., constant, updates, .,..."
1,"Vera of Philadelphia, PA",2016-11-19,1,I'm so over Comcast! The worst internet provid...,"['m, Comcast, !, worst, internet, provider, .,..."
2,"Sarah of Rancho Cordova, CA",2016-11-17,1,If I could give them a negative star or no sta...,"[could, give, negative, star, stars, review, w..."
3,"Dennis of Manchester, NH",2016-11-16,1,I've had the worst experiences so far since in...,"['ve, worst, experiences, far, since, install,..."
4,"Ryan of Bellevue, WA",2016-11-14,1,Check your contract when you sign up for Comca...,"[Check, contract, sign, Comcast, advertised, o..."


In [100]:
# Initialize the PorterStemmer
porter_stemmer = PorterStemmer()

In [101]:
# Stemming function
def perform_stemming(tokens):
    return [porter_stemmer.stem(token) for token in tokens]

In [102]:
# Perform stemming on the 'tokenized_text' column
df['tokenized_text'] = df['tokenized_text'].apply(perform_stemming)

In [103]:
# Check the stemming
print(df['tokenized_text'].head())

0    [use, love, comcast, ., constant, updat, ., in...
1    ['m, comcast, !, worst, internet, provid, ., '...
2    [could, give, neg, star, star, review, would, ...
3    ['ve, worst, experi, far, sinc, instal, 10/4/1...
4    [check, contract, sign, comcast, advertis, off...
Name: tokenized_text, dtype: object


In [104]:
df.head()

Unnamed: 0,author,posted_on,rating,text,tokenized_text
0,"Alantae of Chesterfeild, MI",2016-11-22,1,I used to love Comcast. Until all these consta...,"[use, love, comcast, ., constant, updat, ., in..."
1,"Vera of Philadelphia, PA",2016-11-19,1,I'm so over Comcast! The worst internet provid...,"['m, comcast, !, worst, internet, provid, ., '..."
2,"Sarah of Rancho Cordova, CA",2016-11-17,1,If I could give them a negative star or no sta...,"[could, give, neg, star, star, review, would, ..."
3,"Dennis of Manchester, NH",2016-11-16,1,I've had the worst experiences so far since in...,"['ve, worst, experi, far, sinc, instal, 10/4/1..."
4,"Ryan of Bellevue, WA",2016-11-14,1,Check your contract when you sign up for Comca...,"[check, contract, sign, comcast, advertis, off..."


In [105]:
# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [106]:
# Lemmatization function
def perform_lemmatization(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

In [107]:
# Perform lemmatization on the 'tokenized_text' column
df['tokenized_text'] = df['tokenized_text'].apply(perform_lemmatization)

In [108]:
# Check the lemmatization
print(df['tokenized_text'].head())

0    [use, love, comcast, ., constant, updat, ., in...
1    ['m, comcast, !, worst, internet, provid, ., '...
2    [could, give, neg, star, star, review, would, ...
3    ['ve, worst, experi, far, sinc, instal, 10/4/1...
4    [check, contract, sign, comcast, advertis, off...
Name: tokenized_text, dtype: object


In [109]:
df.head()

Unnamed: 0,author,posted_on,rating,text,tokenized_text
0,"Alantae of Chesterfeild, MI",2016-11-22,1,I used to love Comcast. Until all these consta...,"[use, love, comcast, ., constant, updat, ., in..."
1,"Vera of Philadelphia, PA",2016-11-19,1,I'm so over Comcast! The worst internet provid...,"['m, comcast, !, worst, internet, provid, ., '..."
2,"Sarah of Rancho Cordova, CA",2016-11-17,1,If I could give them a negative star or no sta...,"[could, give, neg, star, star, review, would, ..."
3,"Dennis of Manchester, NH",2016-11-16,1,I've had the worst experiences so far since in...,"['ve, worst, experi, far, sinc, instal, 10/4/1..."
4,"Ryan of Bellevue, WA",2016-11-14,1,Check your contract when you sign up for Comca...,"[check, contract, sign, comcast, advertis, off..."


In [110]:
# Update the 'text' column with the preprocessed tokenized text
df['text'] = df['tokenized_text'].apply(lambda x: ' '.join(x))



In [111]:
# Delete the 'tokenized_text' column
df.drop(columns=['tokenized_text'], inplace=True)

In [112]:
df.head() # Now we don't have the extra column

Unnamed: 0,author,posted_on,rating,text
0,"Alantae of Chesterfeild, MI",2016-11-22,1,use love comcast . constant updat . internet c...
1,"Vera of Philadelphia, PA",2016-11-19,1,'m comcast ! worst internet provid . 'm take o...
2,"Sarah of Rancho Cordova, CA",2016-11-17,1,could give neg star star review would . never ...
3,"Dennis of Manchester, NH",2016-11-16,1,'ve worst experi far sinc instal 10/4/16 . not...
4,"Ryan of Bellevue, WA",2016-11-14,1,check contract sign comcast advertis offer mat...


In [113]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

In [114]:
# Fit and transform the 'text' column to create the term-by-document matrix
term_document_matrix = vectorizer.fit_transform(df['text'])

In [115]:
# Convert the term-document matrix to an array
term_document_matrix_array = term_document_matrix.toarray()

In [116]:
# Check the shape
print("Shape of term-document matrix:", term_document_matrix_array.shape)

Shape of term-document matrix: (2607, 7934)


In [117]:
# Normalize the matrix
normalized_term_document_matrix = term_document_matrix_array / np.linalg.norm(term_document_matrix_array)


In [118]:
# Check the shape
print("Shape of normalized term-document matrix:", normalized_term_document_matrix.shape)

Shape of normalized term-document matrix: (2607, 7934)


In [119]:
# Perform SVD
U, Sigma, VT = np.linalg.svd(normalized_term_document_matrix)

# Print the matrices U, Sigma, and VT
print("Matrix U:")
print(U)
print("Matrix Sigma:")
print(Sigma)
print("Matrix VT:")
print(VT)



# I tried to do this but it's not functioning properly so to continue the next steps i used built in libarary but i added it in the following comments


Matrix U:
[[-3.08107501e-03  2.38490206e-03 -3.29628042e-03 ...  6.56072337e-05
  -1.58178300e-03 -1.00796317e-17]
 [-4.34116050e-03  8.42357239e-03 -1.03542179e-02 ... -1.60692204e-03
   1.11890415e-02  1.12438213e-16]
 [-1.44585222e-02  2.54317331e-02 -1.48559889e-02 ... -1.20902347e-03
   4.06331859e-04 -9.48542281e-17]
 ...
 [-6.88218660e-03  6.56142495e-03  1.24844916e-03 ...  1.50044777e-02
   8.61013219e-03 -1.77180452e-16]
 [-4.92560748e-03  9.15698747e-03  1.13520531e-02 ... -6.42198676e-03
  -1.98228210e-03 -1.57132766e-16]
 [-5.37214286e-03  1.15579034e-02 -1.07828141e-02 ... -5.26436931e-04
  -7.08056084e-04  5.08037447e-17]]
Matrix Sigma:
[5.47667485e-01 1.72734912e-01 1.44852707e-01 ... 2.81546202e-04
 2.19187675e-04 2.83689612e-18]
Matrix VT:
[[-3.59234471e-02 -7.99469065e-04 -6.84921791e-04 ... -1.25599876e-05
  -1.22440829e-04 -1.48536295e-04]
 [ 3.82596622e-02  2.93602778e-03  1.14416994e-04 ... -3.21962556e-05
   1.62824589e-04 -1.32576877e-04]
 [ 4.64851584e-02 -1.3

'\n    def center_data(matrix): # centering data function\n    mean = np.mean(matrix, axis=0)\n    centered_matrix = matrix - mean\n    return centered_matrix, mean\n\ndef power_iteration(A, num_iterations):\n    n = A.shape[0]\n    v = np.random.rand(n)  # Random initialization of eigenvector\n    for _ in range(num_iterations): # range according to the input parameter\n        v = A @ v\n        v /= np.linalg.norm(v)  # Normalize the vector\n    eigenvalue = np.dot(A @ v, v) / np.dot(v, v)  # Rayleigh quotient approximation\n    return eigenvalue, v\n\n\ndef svd(matrix, num_iterations=10): # SVD function using the above functions\n    # Center the data\n    centered_matrix, mean = center_data(matrix)\n\n    # Compute the Covariance Matrix\n    covariance_matrix = np.cov(centered_matrix, rowvar=False)\n\n    # Power Iteration\n    eigenvalues = [] # define the esgenvalues and eigenvectors\n    eigenvectors = []\n    for _ in range(matrix.shape[1]):\n        eigenvalue, eigenvector = 

In [120]:
'''
def center_data(matrix): # centering data function
    mean = np.mean(matrix, axis=0)
    centered_matrix = matrix - mean
    return centered_matrix, mean

def power_iteration(A, num_iterations):
    n = A.shape[0]
    v = np.random.rand(n)  # Random initialization of eigenvector
    for _ in range(num_iterations): # range according to the input parameter
        v = A @ v
        v /= np.linalg.norm(v)  # Normalize the vector
    eigenvalue = np.dot(A @ v, v) / np.dot(v, v)  # Rayleigh quotient approximation
    return eigenvalue, v
'''

'def center_data(matrix):\n    mean = np.mean(matrix, axis=0)\n    centered_matrix = matrix - mean\n    return centered_matrix, mean\n\ndef power_iteration(A, num_iterations):\n    n = A.shape[0]\n    v = np.random.rand(n)  # Random initialization of eigenvector\n    for _ in range(num_iterations):\n        v = A @ v\n        v /= np.linalg.norm(v)  # Normalize the vector\n    eigenvalue = np.dot(A @ v, v) / np.dot(v, v)  # Rayleigh quotient approximation\n    return eigenvalue, v'

In [121]:
'''
def svd(matrix, num_iterations=10): # SVD function using the above functions
    # Center the data
    centered_matrix, mean = center_data(matrix)

    # Compute the Covariance Matrix
    covariance_matrix = np.cov(centered_matrix, rowvar=False)

    # Power Iteration
    eigenvalues = [] # define the esgenvalues and eigenvectors
    eigenvectors = []
    for _ in range(matrix.shape[1]):
        eigenvalue, eigenvector = power_iteration(covariance_matrix, num_iterations)
        eigenvalues.append(eigenvalue)
        eigenvectors.append(eigenvector)

        covariance_matrix -= eigenvalue * np.outer(eigenvector, eigenvector)

    # Create Matrix U and Sigma
    U = np.array(eigenvectors).T
    Sigma = np.diag(np.sqrt(eigenvalues))

    # Compute Matrix V
    V = np.dot(centered_matrix.T, U) / np.diag(Sigma)

    return U, Sigma, V # return the values

'''

'def svd(matrix, num_iterations=10):\n    # Step 1: Center the data\n    centered_matrix, mean = center_data(matrix)\n\n    # Step 2: Compute the Covariance Matrix\n    covariance_matrix = np.cov(centered_matrix, rowvar=False)\n\n    # Step 3: Power Iteration for Principal Component\n    eigenvalues = []\n    eigenvectors = []\n    for _ in range(matrix.shape[1]):\n        eigenvalue, eigenvector = power_iteration(covariance_matrix, num_iterations)\n        eigenvalues.append(eigenvalue)\n        eigenvectors.append(eigenvector)\n\n        # Deflation\n        covariance_matrix -= eigenvalue * np.outer(eigenvector, eigenvector)\n\n    # Step 4: Construct Matrix U and Sigma\n    U = np.array(eigenvectors).T\n    Sigma = np.diag(np.sqrt(eigenvalues))\n\n    # Step 5: Compute Matrix V\n    V = np.dot(centered_matrix.T, U) / np.diag(Sigma)\n\n    return U, Sigma, V'

In [122]:
'''
# Perform Singular Value Decomposition
U, Sigma, VT = svd(normalized_term_document_matrix)

'''

'# Perform Singular Value Decomposition\nU, Sigma, VT = svd(normalized_term_document_matrix)\n'

In [123]:
# Check the shapes of U, Sigma, and VT matrices
print("Shape of U matrix:", U.shape)
print("Shape of Sigma matrix:", Sigma.shape)
print("Shape of VT matrix:", VT.shape)

Shape of U matrix: (2607, 2607)
Shape of Sigma matrix: (2607,)
Shape of VT matrix: (7934, 7934)


In [124]:
# Define a range
k_values = range(10, min(normalized_term_document_matrix.shape) // 10 + 1, 20)


In [125]:
# Initialize variabless for storing k's
mse_errors = []
fn_errors = []

In [126]:
# Calculate reconstruction errors for ks
for k in k_values:
    # Approximate the original matrix
    reconstructed_matrix = U[:, :k] @ np.diag(Sigma[:k]) @ VT[:k, :]
    
    # Calculate Mean Squared Error
    mse = np.mean((normalized_term_document_matrix - reconstructed_matrix) ** 2)
    mse_errors.append(mse)
    
    # Calculate Frobenius Norm
    fn = np.linalg.norm(normalized_term_document_matrix - reconstructed_matrix)
    fn_errors.append(fn)

In [127]:
# Find the index of the minimum MSE and FN
min_mse_index = np.argmin(mse_errors)
min_fn_index = np.argmin(fn_errors)

# Optimal k based on least MSE and FN errors
optimal_k_mse = k_values[min_mse_index]
optimal_k_fn = k_values[min_fn_index]

In [128]:
print("Optimal k based on least MSE:", optimal_k_mse)
print("Optimal k based on least FN:", optimal_k_fn)

Optimal k based on least MSE: 250
Optimal k based on least FN: 250


In [129]:
# Reconstruct the original matrix
reconstructed_matrix = U[:, :optimal_k_mse] @ np.diag(Sigma[:optimal_k_mse]) @ VT[:optimal_k_mse, :]

# Calculate the reconstruction error
mse_reconstruction_error = np.mean((normalized_term_document_matrix - reconstructed_matrix) ** 2)
fn_reconstruction_error = np.linalg.norm(normalized_term_document_matrix - reconstructed_matrix)

print("MSE Reconstruction Error:", mse_reconstruction_error)
print("FN Reconstruction Error:", fn_reconstruction_error)

# the errors are pretty small so it is good 


MSE Reconstruction Error: 6.857711594030514e-09
FN Reconstruction Error: 0.37662246538517613


In [130]:
optimal_k = 250 # assume our k accoring to the calculations

In [131]:
# Query vectors from the question
queries = {
    'query1': ['ignorant', 'overwhelming'],
    'query2': ['xfinity', 'frustrate', 'adapter', 'verizon', 'router'],
    'query3': ['terminate', 'rent', 'promotion', 'joke', 'liar', 'internet', 'horrible'],
    'query4': ['kindergarten', 'ridiculous', 'internet', 'clerk', 'terrible']
}

In [132]:
# Now Converting queries into query vectors
query_vectors = {}
for query_name, query_terms in queries.items():
    query_vector = np.zeros(len(vectorizer.vocabulary_), dtype=int)
    for term in query_terms:
        if term in vectorizer.vocabulary_:
            query_vector[vectorizer.vocabulary_[term]] = 1
    query_vectors[query_name] = query_vector

In [133]:
# Calculate cosine similarity
cosine_similarities = {}
for query_name, query_vector in query_vectors.items():
    cosine_similarities[query_name] = np.dot(query_vector, normalized_term_document_matrix.T) / (
            np.linalg.norm(query_vector) * np.linalg.norm(normalized_term_document_matrix, axis=1))


  cosine_similarities[query_name] = np.dot(query_vector, normalized_term_document_matrix.T) / (


In [134]:
# Finding the most relevant document for each query
most_relevant_documents = {}
for query_name, similarity_scores in cosine_similarities.items():
    most_relevant_document_index = np.argmax(similarity_scores)
    most_relevant_documents[query_name] = df.iloc[most_relevant_document_index]['text']


In [138]:
# Print the most relevant document for each query
for query_name, document_text in most_relevant_documents.items():
    print(f"Most relevant document for {query_name}:")
    print(document_text)
    print("***-----------------------------------------------------------------------------------------***")

Most relevant document for query1:
use love comcast . constant updat . internet cabl crash lot night , sometim day , channel n't even work demand sometim n't play either . wish someth . min ago , internet crash 20 min reason . 'm tire think switch wow someth . plea get xfiniti .
***-----------------------------------------------------------------------------------------***
Most relevant document for query2:
cancel account novemb . bring back equip , told order router septemb need return . state n't order receiv . said evid receiv up ( without signatur ) . say router never activ . iron , purchas router issu month earlier never offer free router . speak yolanda * * , account manag comcast , said worri ask receipt router purchas agre reimburs $ 139 last month plu extra $ 50 router receiv check within 4-6 weeks.aft wait 7 without check , call told need file claim up lost router would send check . one comcast said need call up 12/27 told comcast 's respons contact up , mine . n't feel comca