In [None]:
import requests
from pyunpack import Archive
import os

# Relative path of the directory that create_directory(dir_path) creates
# further down in this cell.
dir_path = 'Dokaz'

def create_directory(directory_path):
    """Make sure ``directory_path`` exists and print what happened.

    Missing parent directories are created as well, and an already existing
    directory is not treated as an error. Any exception raised while creating
    the directory is caught and reported on stdout instead of being
    propagated. Returns None.
    """
    try:
        os.makedirs(directory_path, exist_ok=True)
        outcome = f"Directory '{directory_path}' created or already exists."
    except Exception as err:
        outcome = f"Failed to create directory '{directory_path}': {err}"
    print(outcome)

def download_file_with_custom_headers(url, local_filename, timeout=30):
    """Download ``url`` to ``local_filename`` using a browser-like User-Agent.

    The response body is streamed to disk in 8 KiB chunks, so the whole
    payload is never held in memory at once.

    Parameters
    ----------
    url : str
        Address of the resource to fetch.
    local_filename : str
        Path of the file to write; it is opened in binary write mode, so an
        existing file is overwritten.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout a stalled
        server would block this call indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx or 5xx status code.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    with requests.get(url, headers=headers, stream=True, timeout=timeout) as r:
        # Raises for 4xx/5xx responses; other status codes pass through.
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("File downloaded successfully.")

# Example usage
# The "/raw/" endpoint serves the archive bytes. The "/blob/" URL used before
# returns GitHub's HTML file-viewer page, so the saved file was not a zip.
file_url = 'https://github.com/toperkov/RF-FESB-2023-24/raw/main/lab2/Download/Dokaz.zip'
# NOTE: despite its name, output_dir is the local path of the archive *file*.
output_dir = './Dokaz.zip'
download_file_with_custom_headers(file_url, output_dir)

create_directory(dir_path)
# Extract the archive that was just downloaded. The previous literal
# 'Dokaz-zip' is a path the download step never creates.
Archive(output_dir).extractall(dir_path)

In [1]:
import os
import pandas as pd

# directory whose files are listed
dir_path = r'C:\Users\A507\RF\lcupic\Dokaz'

# keep only regular files; sub-directories are skipped
file_names = [
    entry
    for entry in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, entry))
]

# one-column table holding the collected names
df = pd.DataFrame({'file_name': file_names})

# show the table
print(df)

             file_name
0   Secret_file_11.txt
1   Secret_file_12.pdf
2   Secret_file_22.png
3  Secret_file_48.docx
4   Secret_file_49.pdf
5   Secret_file_52.jpg
6  Secret_file_72.docx
7   Secret_file_92.jpg


In [2]:
import os
import pandas as pd

# directory whose files are listed
dir_path = r'C:\Users\A507\RF\lcupic\Dokaz'

# parallel lists: full file name and its extension (including the dot)
file_names = []
extensions = []

for entry in os.listdir(dir_path):
    # anything that is not a regular file (e.g. a sub-directory) is ignored
    if not os.path.isfile(os.path.join(dir_path, entry)):
        continue
    suffix = os.path.splitext(entry)[1]
    file_names.append(entry)
    extensions.append(suffix)

# one table for the names, a second one for the extensions
df = pd.DataFrame({'file_name': file_names})
df_ext = pd.DataFrame({'extensions': extensions})

# show both tables
print(df)
print(df_ext)

             file_name
0   Secret_file_11.txt
1   Secret_file_12.pdf
2   Secret_file_22.png
3  Secret_file_48.docx
4   Secret_file_49.pdf
5   Secret_file_52.jpg
6  Secret_file_72.docx
7   Secret_file_92.jpg
  extensions
0       .txt
1       .pdf
2       .png
3      .docx
4       .pdf
5       .jpg
6      .docx
7       .jpg


In [3]:
import os
import pandas as pd
import hashlib
import magic
import mimetypes

def file2hash(filename, hash_function, chunk_size=1 << 20):
    """Return the hexadecimal digest of a file's content.

    The file is fed to the hash in pieces of ``chunk_size`` bytes, so large
    files are never loaded into memory in one go.

    Parameters
    ----------
    filename : str
        Path of the file to hash.
    hash_function : callable
        Hash constructor such as ``hashlib.md5`` or ``hashlib.sha256``; it is
        called without arguments and must return an object with ``update``
        and ``hexdigest``.
    chunk_size : int, optional
        Number of bytes requested per read; must be positive.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not a positive number.
    """
    if chunk_size <= 0:
        # read(0) would end the loop at once and read(-1) would slurp the
        # whole file, so reject both explicitly.
        raise ValueError("chunk_size must be a positive integer")

    digest = hash_function()
    with open(filename, 'rb', buffering=0) as f:
        # An unbuffered read returns b'' only at end of file.
        for block in iter(lambda: f.read(chunk_size), b''):
            digest.update(block)
    return digest.hexdigest()

dir_path = 'C:\\Users\\A507\\RF\\lcupic\\Dokaz'

# Parallel per-file lists, filled in lock-step inside the loop below.
file_names = []
extensions = []
md5s = []
sha1s = []
sha256s = []
magic_numbers = []
magic_object = magic.Magic(mime=True)
original_extensions = []
extension_matches = []

# iterate through all entries in the directory
for file in os.listdir(dir_path):
    path = os.path.join(dir_path, file)
    # skip anything that is not a regular file (e.g. a sub-directory)
    if not os.path.isfile(path):
        continue

    stem, extension = os.path.splitext(file)
    file_names.append(stem)
    extensions.append(extension)
    md5s.append(file2hash(path, hashlib.md5))
    sha1s.append(file2hash(path, hashlib.sha1))
    sha256s.append(file2hash(path, hashlib.sha256))

    # Type reported by python-magic for the file content; with mime=True
    # this is a MIME type string rather than a textual description.
    magic_number = magic_object.from_file(path)
    magic_numbers.append(magic_number)
    detected_mime = magic_number.lower()

    # guess_all_extensions returns an empty list for MIME types it does not
    # know; record None in that case instead of failing with IndexError.
    candidate_extensions = mimetypes.guess_all_extensions(detected_mime)
    original_extensions.append(candidate_extensions[0] if candidate_extensions else None)

    # guess_type yields None for an unrecognised extension, and "None in str"
    # raises TypeError, so an unknown or missing extension counts as a mismatch.
    claimed_mime = mimetypes.guess_type('test' + extension.lower())[0]
    extension_matches.append(
        bool(extension) and claimed_mime is not None and claimed_mime in detected_mime
    )

# create a Pandas dataframe with one row per file
df = pd.DataFrame({
    'file_name': file_names,
    'extension': extensions,
    'md5': md5s,
    'sha1': sha1s,
    'sha256': sha256s,
    'magic_number': magic_numbers,
    'original_extensions': original_extensions,
})

# print the dataframe, then the per-file "extension matches content" flags
print(df)
print(extension_matches)

        file_name extension                               md5  \
0  Secret_file_11      .txt  187b0341b64a0e16d74ea3b50e22e6c2   
1  Secret_file_12      .pdf  a0c34dd882f1e7e5af2341b1aadb532c   
2  Secret_file_22      .png  40bba5dd7b99c5e99d0877993ec3f3fb   
3  Secret_file_48     .docx  7a2fdf83ebce571af7fcb93a8825ead1   
4  Secret_file_49      .pdf  40bba5dd7b99c5e99d0877993ec3f3fb   
5  Secret_file_52      .jpg  8de0128acae226c00efb5f98a9486e68   
6  Secret_file_72     .docx  098cb2f52dda9e3195075d765a9ff01f   
7  Secret_file_92      .jpg  4266ea20320c598bc5d7b1d731e3e9e9   

                                       sha1  \
0  08152ff79a16f2a181174eb5fe8de0197f4d722f   
1  55afda15a501e69ee9bf94fd26e9d74239127e01   
2  19b85cc663198f1145078e17dc4af9966b4b9e86   
3  e6d6de3bc2c41cccc1d71a27ec175b0bf2c73e20   
4  19b85cc663198f1145078e17dc4af9966b4b9e86   
5  c15e32d27635f248c1c8b66bb012850e5b342119   
6  fb5184f27b5780c74714d89f6c43da0c48e8cd3a   
7  1a30a1d139131e7105a1896d7719c85c030