# Data analysis

In [1]:
import pandas as pd
import os
import numpy as np
from collections import Counter
import zipfile


%load_ext autoreload
%autoreload 2
%autosave 180

Autosaving every 180 seconds


## Extract Data 

In [2]:
# Paths to the two zip archives that are read below; each archive is expected
# to contain CSV files. Relative paths resolve against the kernel's working directory.
zip_path_sp = 'sp.zip' 
zip_path_nsp = 'no_sp.zip'

def extract_columns_from_zip(zip_path):
    """Read the first two columns of every CSV stored in a zip archive.

    Parameters
    ----------
    zip_path : path or file-like object
        Archive handed directly to ``zipfile.ZipFile`` in read mode.

    Returns
    -------
    dict
        Maps the position of each ``.csv`` entry in ``ZipFile.namelist()`` to a
        ``(col1, col2)`` tuple of plain Python lists. Entries that do not end in
        ``.csv`` are skipped but still occupy a position, so the keys are not
        guaranteed to be contiguous or to start at 0.

    Notes
    -----
    The CSVs are parsed without a header row, so the first line of each file
    is treated as data.
    """
    columns_by_position = {}

    with zipfile.ZipFile(zip_path, 'r') as archive:
        for position, member_name in enumerate(archive.namelist()):
            # Only CSV members are parsed; everything else is ignored.
            if not member_name.endswith('.csv'):
                continue

            with archive.open(member_name) as handle:
                frame = pd.read_csv(handle, header=None)

            # With header=None the columns are labelled 0, 1, 2, ...
            first_column = frame[0].tolist()
            second_column = frame[1].tolist()
            columns_by_position[position] = (first_column, second_column)

    return columns_by_position

# Load the per-file columns from each archive
data_SP = extract_columns_from_zip(zip_path_sp)
data_NSP = extract_columns_from_zip(zip_path_nsp)

# Preview both data sets: one heading per archive, then the first 5 values
# of each column for every file as a sample
for heading, dataset in (("Folder 1 Data:", data_SP), ("\nFolder 2 Data:", data_NSP)):
    print(heading)
    for idx, (col1, col2) in dataset.items():
        print(f"File {idx}: Column 1 - {col1[:5]}, Column 2 - {col2[:5]}...")



Folder 1 Data:
File 1: Column 1 - [5779.0, 32527.0, 28702.0, 11029.0, 24144.0], Column 2 - [0.0895118862390518, 0.08010184019804, 0.06753815710544586, 0.06464803963899612, 0.06243804469704628]...
File 2: Column 1 - [30101.0, 9761.0, 18357.0, 25610.0, 14527.0], Column 2 - [0.1050957813858986, 0.092267245054245, 0.07321581244468689, 0.07181081920862198, 0.06674166768789291]...
File 3: Column 1 - [30101.0, 25610.0, 18357.0, 9761.0, 29451.0], Column 2 - [0.12267198413610458, 0.10074727982282639, 0.08535652607679367, 0.085012286901474, 0.0660361722111702]...
File 4: Column 1 - [28702.0, 23930.0, 9761.0, 31699.0, 30101.0], Column 2 - [0.11034052073955536, 0.08050413429737091, 0.07631363719701768, 0.07559821009635925, 0.07525978982448579]...
File 5: Column 1 - [30101.0, 25610.0, 28702.0, 18357.0, 23930.0], Column 2 - [0.12226267904043196, 0.08573133498430253, 0.08441074192523955, 0.08217068761587144, 0.061022602021694176]...
File 6: Column 1 - [28702.0, 9761.0, 23930.0, 30101.0, 8253.0], Colu

In [3]:
# print(folder1_data)

In [4]:
# Function to get the column-1 values that recur often enough across all files
def find_repeated_values(data_dict, min_count=50):
    """Count column-1 values across every file and keep the frequent ones.

    Parameters
    ----------
    data_dict : dict
        Mapping whose values are ``(col1, col2)`` pairs; only ``col1`` (an
        iterable of hashable values) is inspected.
    min_count : int, optional
        Minimum total number of occurrences a value needs in order to be kept.
        Defaults to 50, which reproduces the previously hard-coded
        ``count > 49`` filter.

    Returns
    -------
    dict
        ``{value: count}`` for every value whose total count is at least
        ``min_count``, in order of first appearance.

    Notes
    -----
    Counts are totals over all files, so a value occurring several times
    inside a single file contributes each occurrence.
    """
    value_counts = Counter()
    for col1, _col2 in data_dict.values():
        # Tally each file's column 1 directly instead of building one big list first.
        value_counts.update(col1)

    # Keep only the values that reach the occurrence threshold.
    return {val: count for val, count in value_counts.items() if count >= min_count}

# Collect the frequently recurring column-1 values of each data set
repeated_SP = find_repeated_values(data_SP)
repeated_NSP = find_repeated_values(data_NSP)

# Report each result under its own heading (heading and value on separate lines)
print("Repeated values in data_SP (column 1):", repeated_SP, sep="\n")
print("\nRepeated values in data_NSP (column 1):", repeated_NSP, sep="\n")

Repeated values in data_SP (column 1):
{32527.0: 50, 23930.0: 50, 31699.0: 50, 7140.0: 50, 9761.0: 50, 23336.0: 50, 27560.0: 50, 2625.0: 50, 8253.0: 50, 6301.0: 50, 10621.0: 50, 9154.0: 50, 31298.0: 50, 20138.0: 50, 21681.0: 50, 27100.0: 50, 4955.0: 50, 31878.0: 50, 5987.0: 50, 29798.0: 50, 12357.0: 50, 13043.0: 50, 10966.0: 50, 30767.0: 50, 22631.0: 50, 25338.0: 50, 23105.0: 50, 27719.0: 50, 28078.0: 50, 23428.0: 50, 21701.0: 50, 22666.0: 50, 12633.0: 50, 25355.0: 50, 11488.0: 50, 7584.0: 50, 20673.0: 50, 9682.0: 50, 2826.0: 50, 3001.0: 50, 7142.0: 50, 41.0: 50, 24814.0: 50, 16086.0: 50, 26432.0: 50, 2772.0: 50, 8569.0: 50, 18187.0: 50, 13519.0: 50, 6434.0: 50, 14283.0: 50, 26738.0: 50, 23086.0: 50, 14731.0: 50, 10180.0: 50, 3775.0: 50, 2542.0: 50, 21550.0: 50, 29467.0: 50, 5730.0: 50, 15890.0: 50, 4011.0: 50, 27599.0: 50, 21155.0: 50, 21397.0: 50, 24366.0: 50, 14530.0: 50, 17931.0: 50, 27828.0: 50, 11718.0: 50, 19399.0: 50, 32082.0: 50, 30433.0: 50, 2002.0: 50, 21954.0: 50, 11759.0: 

In [22]:
# Function to compare repeated values between the two datasets
def compare_repeated_values(repeated_SP, repeated_NSP):
    """Return the keys that occur in both repeated-value mappings.

    Parameters
    ----------
    repeated_SP, repeated_NSP : dict
        Mappings whose keys are compared; their values are ignored.

    Returns
    -------
    set
        Every key present in both ``repeated_SP`` and ``repeated_NSP``.
    """
    sp_keys = set(repeated_SP.keys())
    # Walk the second mapping and keep whatever the first one also contains.
    return {key for key in repeated_NSP.keys() if key in sp_keys}

# Intersect the repeated values of the two data sets
common_repeated_values = compare_repeated_values(repeated_SP, repeated_NSP)

# Show the shared values beneath a heading
print(
    "Common repeated values in column 1 between data_SP and data_NSP:",
    common_repeated_values,
    sep="\n",
)

Common repeated values in column 1 between data_SP and data_NSP:
{23428.0, 31878.0, 22666.0, 18187.0, 25355.0, 2826.0, 14731.0, 15890.0, 6434.0, 28078.0, 30767.0, 23086.0, 3775.0, 26432.0, 2625.0, 9154.0, 23105.0, 20673.0, 21701.0, 14530.0, 14283.0, 13519.0, 9682.0, 12633.0, 27100.0, 11488.0, 29798.0, 2542.0, 13043.0, 8569.0}
