In [1]:
!pip install PyMuPDF
!pip install nltk
!pip install pandas
!pip install spacy
!python -m spacy download pl_core_news_lg
!pip install pillow 
!pip install sklearn
!pip install pandas openpyxl
!pip install pdfminer.six spacy nltk scikit-learn pytesseract pillow pandas openpyxl python-docx
!pip install scikit-learn

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.9 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.9 PyMuPDFb-1.24.9
Collecting pl-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pl_core_news_lg-3.7.0/pl_core_news_lg-3.7.0-py3-none-any.whl (573.7 MB)
[2K     [90m━━━━

In [None]:
import pandas as pd
import fitz
import os
import io
import pytesseract
from PIL import Image
import spacy
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import re

nlp = spacy.load("pl_core_news_lg")

def remove_hyphenation(text):
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)
    text = text.replace('\n', ' ')
    return text

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
            image_list = page.get_images(full=True)
            for img_index, img in enumerate(image_list):
                try:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    image = Image.open(io.BytesIO(image_bytes))
                    text += pytesseract.image_to_string(image, lang='pol')
                except Exception:
                    continue
    except Exception:
        pass 
    text = remove_hyphenation(text)
    text = clean_text(text)
    return text.lower()

def extract_text_from_txt(txt_path):
    text = ""
    try:
        with open(txt_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except Exception:
        pass
    text = remove_hyphenation(text)
    text = clean_text(text)
    return text.lower()

def lemmatize_text(text):
    try:
        doc = nlp(text)
        lemmas = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
        return ' '.join(lemmas), lemmas
    except Exception:
        return "", []

def process_file(file_path):
    if file_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    elif file_path.endswith('.txt'):
        text = extract_text_from_txt(file_path)
    else:
        return "", "", []
    
    text = text.replace('\n', ' ').replace('\n\n', ' ')
    lemmatized_text, lemmas = lemmatize_text(text)
    return text, lemmatized_text, lemmas

directory_path = '/kaggle/input/testowe-strategie-konrad'
xlsx_path = '/kaggle/input/branza-r/brana_nazwa_rozsz.xlsx' 
df = pd.read_excel(xlsx_path, engine='openpyxl')

def lemmatize_nazwa_column(df):
    df['nazwa_lematyzowana'], df['lemmas'] = zip(*df['nazwa'].apply(lambda x: lemmatize_text(x) if isinstance(x, str) else (x, [])))
    return df

df = lemmatize_nazwa_column(df)
all_results = []
extracted_words = []
lemmatized_words = []

for file_name in os.listdir(directory_path):
    if file_name.endswith('.pdf') or file_name.endswith('.txt'):
        file_path = os.path.join(directory_path, file_name)
        extracted_text, lemmatized_text, lemmas = process_file(file_path)
        
        if lemmatized_text:
            vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform([lemmatized_text] + df['nazwa_lematyzowana'].tolist())
            similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
            temp_df = pd.DataFrame({'file_name': file_name, 'branża': df['branża'], 'match_score': similarity_matrix[0]})
            data = temp_df[['match_score']].values
            scaler = StandardScaler()
            data_scaled = scaler.fit_transform(data)
            dbscan = DBSCAN(eps=0.5, min_samples=2)
            clusters = dbscan.fit_predict(data_scaled)
            temp_df['cluster'] = clusters
            
            cluster_mean_scores = temp_df.groupby(['file_name', 'cluster'])['match_score'].mean().reset_index()
            cluster_mean_scores.columns = ['file_name', 'cluster', 'mean_match_score']
            min_mean_scores = cluster_mean_scores.groupby('file_name')['mean_match_score'].transform('min')
            cluster_mean_scores['is_min'] = cluster_mean_scores['mean_match_score'] == min_mean_scores
            filtered_clusters = cluster_mean_scores[~cluster_mean_scores['is_min']]
            final_results = temp_df.merge(filtered_clusters[['file_name', 'cluster']], on=['file_name', 'cluster'], how='inner')
            all_results.append(final_results)
            extracted_words.extend([(file_name, word) for word in extracted_text.split()])
            lemmatized_words.extend([(file_name, lemma) for lemma in lemmas])

all_results_df = pd.concat(all_results, ignore_index=True)
extracted_words_df = pd.DataFrame(extracted_words, columns=['file_name', 'word'])
lemmatized_words_df = pd.DataFrame(lemmatized_words, columns=['file_name', 'lemma'])
extracted_words_df.to_csv('extracted_words.csv', index=False)
lemmatized_words_df.to_csv('lemmatized_words.csv', index=False)
branche_count = all_results_df.groupby('file_name')['branża'].nunique().reset_index(name='branche_count')
plt.figure(figsize=(10, 6))
plt.hist(branche_count['branche_count'], bins=20, edgecolor='k', alpha=0.7)
plt.xlabel('Liczba branż')
plt.ylabel('Liczba gmin')
plt.title('Histogram liczby branż w gminach po filtracji')
plt.grid(axis='y', alpha=0.75)
plt.show()
all_results_df.to_csv('filtered_clusters_results_all_files.csv', index=False)


MuPDF error: syntax error: invalid key in dict

MuPDF error: format error: non-page object in page tree

MuPDF error: syntax error: expected 'obj' keyword (1211 0 ?)

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: synt

In [9]:
import pandas as pd
import numpy as np

gminy = all_results_df['file_name'].unique()
branze = df['branża'].unique()

macierz = pd.DataFrame(0, index=gminy, columns=branze)

for _, row in all_results_df.iterrows():
    gmina = row['file_name']
    branza = row['branża']
    macierz.loc[gmina, branza] = 1

macierz = macierz.sort_index()

macierz.to_csv('macierz_gminy_branze.csv')

print(macierz.head())


             1.11   1.12   1.13   1.14   1.15   1.16   1.19   1.21   1.22   \
0208133.pdf      0      0      0      0      0      0      0      0      0   
0209073.pdf      0      0      0      0      0      0      0      0      0   
0210011.pdf      0      0      0      0      0      0      0      0      0   
0210021.pdf      0      0      0      0      0      0      0      0      0   
0210033.pdf      0      0      0      0      0      0      0      0      0   

             1.23   ...  95.29  96.01  96.02  96.03  96.04  96.09  97.00  \
0208133.pdf      0  ...      0      0      0      0      0      0      0   
0209073.pdf      0  ...      0      0      0      0      0      0      0   
0210011.pdf      0  ...      0      0      0      0      0      0      0   
0210021.pdf      0  ...      0      0      0      0      0      0      0   
0210033.pdf      0  ...      0      0      0      0      0      0      0   

             98.10  98.20  99.00  
0208133.pdf      0      0      0  
0209

In [8]:
import pandas as pd
import numpy as np

gminy = all_results_df['file_name'].unique()
branze = all_results_df['branża'].unique()
macierz = pd.DataFrame(0, index=gminy, columns=branze)

for _, row in all_results_df.iterrows():
    gmina = row['file_name']
    branza = row['branża']
    macierz.loc[gmina, branza] = 1

macierz = macierz.sort_index()
macierz.to_csv('macierz_gminy_branze.csv')
print(macierz.head())



             20.30  23.44  62.02  64.30  66.30  70.22  71.11  84.11  84.12  \
0208133.pdf      0      0      0      0      0      0      1      1      1   
0209073.pdf      0      0      0      0      0      0      1      1      1   
0210011.pdf      0      0      0      0      0      0      0      0      0   
0210021.pdf      0      0      0      0      0      0      1      0      1   
0210033.pdf      0      0      0      0      0      0      0      0      0   

             84.13  ...  98.20  17.22  20.17  32.11  77.34  55.20  22.23  \
0208133.pdf      1  ...      0      0      0      0      0      0      0   
0209073.pdf      1  ...      0      0      0      0      0      0      0   
0210011.pdf      0  ...      0      0      1      1      0      0      1   
0210021.pdf      0  ...      0      0      0      0      0      0      0   
0210033.pdf      0  ...      0      0      0      0      0      0      0   

             25.29  25.91  25.92  
0208133.pdf      0      0      0  
0209