In [13]:
import matplotlib.pyplot as plt
import pandas as pd
from scipy.sparse import csr_matrix

In [14]:
# Read the keyword workbook into a DataFrame.
dataset_path = "keyword_issue.xlsx"
data = pd.read_excel(dataset_path)

In [15]:
# Keep just the three keyword columns: key_1, key_2 and key_3.
selected_data = data.loc[:, ['key_1', 'key_2', 'key_3']]

# Function to create a zero-initialized keyword-by-keyword matrix
def create_bipartite_matrix_zero_initialized(dataframe):
    """Build an all-zero square matrix indexed by the unique keywords of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose cells hold keywords. Cells for which ``pd.notna`` is
        false are ignored.

    Returns
    -------
    matrix : scipy.sparse.csr_matrix
        Integer matrix of shape ``(n_keywords, n_keywords)`` with no stored
        non-zero entries.
    unique_keywords : list
        The distinct non-missing keywords; a keyword's position in this list
        is its row/column index in ``matrix``.
    keyword_to_index : dict
        Mapping from keyword to its row/column index.
    """
    # Extract all unique keywords from the dataframe and drop missing values
    unique_keywords = pd.unique(dataframe.values.ravel('K'))
    unique_keywords = unique_keywords[pd.notna(unique_keywords)]

    # Map each keyword to its row/column index
    keyword_to_index = {keyword: i for i, keyword in enumerate(unique_keywords)}

    # Both axes are keywords, so no cell is filled from the dataframe rows here.
    # An earlier version wrote matrix[row_idx, col_idx] = 1 with the dataframe
    # row label as the row index; on a keyword-by-keyword matrix that marks
    # unrelated cells and raises IndexError once the frame has more rows than
    # unique keywords.
    n_keywords = len(unique_keywords)
    matrix = csr_matrix((n_keywords, n_keywords), dtype=int)

    return matrix, list(unique_keywords), keyword_to_index

# Build the keyword matrix together with its keyword list and index lookup
(
    bipartite_matrix_zero_initialized,
    unique_keywords,
    keyword_to_index,
) = create_bipartite_matrix_zero_initialized(selected_data)

# Summarise the result; the bare name on the last line renders it as cell output
matrix_zero_info = dict([
    ('Shape', bipartite_matrix_zero_initialized.shape),
    ('Number of Unique Keywords', len(unique_keywords)),
    ('Sample Unique Keywords', unique_keywords[:10]),  # first 10 keywords as a sample
])
matrix_zero_info


  self._set_intXint(row, col, x.flat[0])


{'Shape': (123, 123),
 'Number of Unique Keywords': 123,
 'Sample Unique Keywords': ['flare package',
  'permohonan perubahan kedua',
  'penanganan limbah cair',
  'process',
  'pembiayaan',
  'undangan',
  'change notice',
  'change infrastructure facility',
  'treatment',
  'items']}

In [16]:
# Load the letters workbook and join each title with its body text
dataset_path = "surat_ctr_format_clean.xlsx"
df = pd.read_excel(dataset_path)
# The body column name carries a trailing space in the workbook header.
df_text = df["Title"].add(". ").add(df["letter_content "])

In [17]:
# Check how the keywords relate to each other inside df_text

# Function to update the bipartite matrix based on keyword co-occurrence in documents
def update_matrix_with_keyword_cooccurrence(df_text, bipartite_matrix, unique_keywords, keyword_to_index):
    """Mark, in place, every pair of distinct keywords found in the same document.

    A keyword counts as present when its whitespace-separated tokens appear
    consecutively in the document, so multi-word keywords are matched as
    phrases. A single-word keyword matches exactly when it equals one of the
    document's whitespace-separated tokens. Matching is case-sensitive.

    Parameters
    ----------
    df_text : iterable
        Documents to scan. Entries that are not ``str`` (e.g. NaN produced by
        concatenating a missing column) are skipped.
    bipartite_matrix : matrix supporting ``m[i, j] = value``
        Square keyword-by-keyword matrix, modified in place. Cells ``[i, j]``
        and ``[j, i]`` are set to 1 for each co-occurring pair.
    unique_keywords : sequence
        Keywords; a keyword's position is its row/column index. Entries that
        are not ``str`` or contain only whitespace never match.
    keyword_to_index : dict
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    None
    """
    # NOTE(review): punctuation glued to a word (e.g. "notice.") still prevents
    # a match, as it did before — confirm whether tokens should be stripped.

    # Normalise every keyword once: collapse whitespace and pad with spaces so
    # that a substring test can only succeed on whole-token boundaries.
    padded_keywords = []
    for position, keyword in enumerate(unique_keywords):
        if not isinstance(keyword, str):
            continue
        normalised = " ".join(keyword.split())
        if normalised:
            padded_keywords.append((position, " " + normalised + " "))

    # Collect index pairs first so each matrix cell is written at most once.
    cooccurring_pairs = set()
    for doc in df_text:
        if not isinstance(doc, str):
            continue  # missing document: nothing to match
        padded_doc = " " + " ".join(doc.split()) + " "
        present = [position for position, padded in padded_keywords if padded in padded_doc]
        for offset, i in enumerate(present):
            for j in present[offset + 1:]:
                cooccurring_pairs.add((i, j))

    for i, j in sorted(cooccurring_pairs):
        bipartite_matrix[i, j] = 1
        bipartite_matrix[j, i] = 1  # keep the matrix symmetric

# Fill the keyword matrix in place from the documents in df_text
update_matrix_with_keyword_cooccurrence(
    df_text,
    bipartite_matrix_zero_initialized,
    unique_keywords,
    keyword_to_index,
)

# Summary after the update; the bare name on the last line renders it as cell output
updated_matrix_info = dict([
    ('Updated Shape', bipartite_matrix_zero_initialized.shape),
    ('Number of Unique Keywords', len(unique_keywords)),
])
updated_matrix_info


  self._set_intXint(row, col, x.flat[0])


{'Updated Shape': (123, 123), 'Number of Unique Keywords': 123}

In [18]:
# Write the keyword matrix to an Excel workbook

# Densify the sparse matrix so it can be wrapped in a DataFrame
dense_matrix = bipartite_matrix_zero_initialized.toarray()

# Label both axes with the keywords
df_matrix = pd.DataFrame(dense_matrix, index=unique_keywords, columns=unique_keywords)

# Export, then confirm where the file went
excel_file_path = 'bipartite_matrix.xlsx'
df_matrix.to_excel(excel_file_path)

print(f"The bipartite matrix has been saved to {excel_file_path}")


The bipartite matrix has been saved to bipartite_matrix.xlsx
