### Imports

In [None]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")


### Basic approach - up to 10k elements

In [None]:
from elasticsearch import Elasticsearch

# Connect to Elasticsearch
client = Elasticsearch(
    "http://192.168.5.188:9200/", 
    basic_auth=("despot.markovic", "KJmhyg7cPxfVG84UTuE") 
)

# Define the query to match documents where 'site' is an empty string
query = {
    "query": {
        "term": {
            "site": "" 
        }
    }
}

# Execute the search
response = client.search(
    index="insightful-fragments-v8",
    size=10000,  # Adjust the size as needed to pull all documents (consider using scroll for larger datasets)
    body=query  # Use 'body' to pass the query
)

# Extract the documents where 'site' is an empty string
documents_with_empty_site = [doc['_source'] for doc in response['hits']['hits']]

# Print or process the results
for doc in documents_with_empty_site:
    print(doc['app'])  # Print the app name or process as needed


### Complex compositions

In [None]:
from elasticsearch import Elasticsearch

# Connect to Elasticsearch
client = Elasticsearch(
    "http://192.168.5.188:9200/", 
    basic_auth=("despot.markovic", "KJmhyg7cPxfVG84UTuE") 
)

# Define the aggregation query to get unique 'app' values where 'site' is not empty
query = {
    "size": 0,
    "query": {
        "bool": {
            "must": [
                {"exists": {"field": "site"}},
                {"wildcard": {"site": "*?"}} 
            ]
        }
    },
    "aggs": {
        "unique_apps": {
            "terms": {
                "field": "app.keyword",
                "size": 10000
            }
        }
    }
}

# Execute the search with aggregation
response = client.search(index="insightful-fragments-v8", body=query)

# Extract the unique apps from the aggregation results
unique_apps = [bucket['key'] for bucket in response['aggregations']['unique_apps']['buckets']]

# Print or process the unique apps
for app in unique_apps:
    print(app)


In [None]:
from elasticsearch import Elasticsearch

# Connect to Elasticsearch
client = Elasticsearch(
    "http://192.168.5.188:9200/", 
    basic_auth=("despot.markovic", "KJmhyg7cPxfVG84UTuE") 
)

# Initialize an empty list to store all unique apps
unique_apps = []
after_key = None  # For pagination

# Loop through composite aggregation to collect all unique 'app' values
while True:
    # Define the composite aggregation query with pagination
    query = {
        "size": 0,
        "aggs": {
            "unique_apps": {
                "composite": {
                    "sources": [
                        {"app": {"terms": {"field": "app.keyword"}}}
                    ],
                    "size": 10000  # Adjust size per page as needed
                }
            }
        }
    }
    
    # Include 'after' key only if we have a valid after_key value
    if after_key:
        query["aggs"]["unique_apps"]["composite"]["after"] = after_key

    # Execute the search with aggregation
    response = client.search(index="insightful-fragments-v8", body=query)

    # Extract the unique apps from the aggregation results
    buckets = response['aggregations']['unique_apps']['buckets']
    unique_apps.extend([bucket['key']['app'] for bucket in buckets])

    # Check if there is more data to fetch; if not, break the loop
    if 'after_key' in response['aggregations']['unique_apps']:
        after_key = response['aggregations']['unique_apps']['after_key']
    else:
        break

# Print or process the unique apps
for app in unique_apps:
    print(app)

# Optionally, remove duplicates if necessary
unique_apps = list(set(unique_apps))  # Converts to a set and back to list for deduplication if needed


In [None]:
from elasticsearch import Elasticsearch

# Connect to Elasticsearch
client = Elasticsearch(
    "http://192.168.5.188:9200/", 
    basic_auth=("despot.markovic", "KJmhyg7cPxfVG84UTuE") 
)

# Initialize an empty list to store all unique apps
unique_apps = []
after_key = None  # For pagination

# Loop through composite aggregation to collect all unique 'app' values
while True:
    # Define the composite aggregation query with pagination
    query = {
        "size": 0,
        "aggs": {
            "unique_sites": {
                "composite": {
                    "sources": [
                        {"site": {"terms": {"field": "site.keyword"}}}
                    ],
                    "size": 10000  # Adjust size per page as needed
                }
            }
        }
    }
    
    # Include 'after' key only if we have a valid after_key value
    if after_key:
        query["aggs"]["unique_sites"]["composite"]["after"] = after_key

    # Execute the search with aggregation
    response = client.search(index="insightful-fragments-v8", body=query)

    # Extract the unique apps from the aggregation results
    buckets = response['aggregations']['unique_sites']['buckets']
    unique_apps.extend([bucket['key']['site'] for bucket in buckets])

    # Check if there is more data to fetch; if not, break the loop
    if 'after_key' in response['aggregations']['unique_sites']:
        after_key = response['aggregations']['unique_sites']['after_key']
    else:
        break

# Optionally, remove duplicates if necessary
unique_apps = list(set(unique_apps))  # Converts to a set and back to list for deduplication if needed


In [None]:
import pandas as pd

# Convert the list to a DataFrame with a column name "browsers"
df = pd.DataFrame(unique_apps, columns=["apps"])

# Save the DataFrame to a CSV file
df.to_csv("apps.csv", index=False, sep=';')


### Scrolling approach - All data pulling

In [None]:
from elasticsearch import Elasticsearch
import time

# Connect to the Elasticsearch cluster
client = Elasticsearch(
    "http://192.168.5.188:9200/", 
    basic_auth=("despot.markovic", "KJmhyg7cPxfVG84UTuE")
)

# To store the pulled data
all_documents = []

# Start the initial search request with scroll
scroll_size = 10000
scroll_time = '2m'
iteration = 0
total_docs = 0
batch_size = 20

start_time = time.time()

# Initial search query with scroll
response = client.search(
    index="insightful-fragments-v8",
    size=scroll_size,
    scroll=scroll_time,
    query={"match_all": {}}
)

# Extract the first scroll ID and documents
scroll_id = response['_scroll_id']
hits = response['hits']['hits']
total_docs += len(hits)

# Add the first batch of documents to the list
all_documents.extend([doc['_source'] for doc in hits])

# Continue scrolling to pull all data
while len(hits) > 0:
    iteration += 1
    # Fetch the next batch of results using the scroll ID
    response = client.scroll(scroll_id=scroll_id, scroll=scroll_time)
    
    # Update scroll ID and retrieve the next batch of hits
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']
    total_docs += len(hits)
    
    # Add the new batch of documents to the list
    all_documents.extend([doc['_source'] for doc in hits])
    
    # Print progress every 20 iterations
    if iteration % batch_size == 0:
        time_spent = time.time() - start_time
        print(f"Iteration: {iteration}, Total documents pulled: {total_docs}")
        print(f"Time spent for last 20 iterations: {time_spent:.2f} seconds")
        # Reset the timer after each 20 iterations
        start_time = time.time()

# Once all data is retrieved, clear the scroll context
client.clear_scroll(scroll_id=scroll_id)

print(f"All documents pulled. Total: {total_docs}")


### Scrolling approach - pulling by sorted employeeId

In [None]:
from elasticsearch import Elasticsearch
import time

# Connect to the Elasticsearch cluster
client = Elasticsearch(
    "http://192.168.5.188:9200/", 
    basic_auth=("despot.markovic", "KJmhyg7cPxfVG84UTuE")
)

# To store the pulled data
all_documents = []

# Start the initial search request with scroll
scroll_size = 10000
scroll_time = '2m'
iteration = 0
total_docs = 0
batch_size = 20

start_time = time.time()

# Set the threshold for the number of unique employeeIds to pull data for
max_employee_ids_to_pull = 150 

# Track the current employeeId to ensure we finish pulling all documents for that employeeId
current_employee_id = None
unique_employee_ids = set()

# Initial search query with scroll, sorted by employeeId
response = client.search(
    index="insightful-fragments-v8",
    size=scroll_size,
    scroll=scroll_time,
    query={"match_all": {}},
    sort=[{"employeeId": {"order": "asc"}}]
)

# Extract the first scroll ID and documents
scroll_id = response['_scroll_id']
hits = response['hits']['hits']

# Continue scrolling to pull all data until we've pulled for the desired number of unique employeeIds
while len(hits) > 0:
    iteration += 1
    for hit in hits:
        employee_id = hit['_source'].get('employeeId')

        # If we haven't reached the max_employee_ids_to_pull, add documents
        if len(unique_employee_ids) < max_employee_ids_to_pull or (current_employee_id == employee_id):
            # If this is a new employeeId, add it to the set
            if employee_id not in unique_employee_ids:
                unique_employee_ids.add(employee_id)
                current_employee_id = employee_id
            
            # Add document to the list
            all_documents.append(hit['_source'])
            total_docs += 1
        else:
            break 

    # Fetch the next batch of results using the scroll ID
    response = client.scroll(scroll_id=scroll_id, scroll=scroll_time)
    
    # Update scroll ID and retrieve the next batch of hits
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    # Print progress every 20 iterations
    if iteration % batch_size == 0:
        time_spent = time.time() - start_time
        print(f"Iteration: {iteration}, Total documents pulled: {total_docs}")
        print(f"Time spent for last 20 iterations: {time_spent:.2f} seconds")
        print(f"Unique employeeIds processed: {len(unique_employee_ids)}")
        # Reset the timer after each 20 iterations
        start_time = time.time()

    # If we've collected data for the desired number of unique employeeIds, stop the loop
    if len(unique_employee_ids) >= max_employee_ids_to_pull:
        break

# Once all data is retrieved or the target number of employeeIds is met, clear the scroll context
client.clear_scroll(scroll_id=scroll_id)

print(f"All documents pulled. Total: {total_docs}")
print(f"Total unique employeeIds pulled: {len(unique_employee_ids)}")


### Pulling one whole specific team from db

In [None]:
from elasticsearch import Elasticsearch
import time

# Connect to the Elasticsearch cluster
client = Elasticsearch(
    "http://192.168.5.188:9200/", 
    basic_auth=("despot.markovic", "KJmhyg7cPxfVG84UTuE")
)

# To store the pulled data
all_documents = []

# Start the initial search request with scroll
scroll_size = 10000
scroll_time = '2m'
iteration = 0
total_docs = 0
batch_size = 20

start_time = time.time()

# Specify the teamId to filter data
target_team_id = "wxqdcesicguzlsh"

# Initial search query with scroll, filtering by teamId
response = client.search(
    index="insightful-fragments-v8",
    size=scroll_size,
    scroll=scroll_time,
    query={
        "term": {
            "teamId.keyword": target_team_id
        }
    }
)

# Extract the first scroll ID and documents
scroll_id = response['_scroll_id']
hits = response['hits']['hits']

# Continue scrolling to pull all data for the specified teamId
while len(hits) > 0:
    iteration += 1
    for hit in hits:
        # Add document to the list
        all_documents.append(hit['_source'])
        total_docs += 1

    # Fetch the next batch of results using the scroll ID
    response = client.scroll(scroll_id=scroll_id, scroll=scroll_time)
    
    # Update scroll ID and retrieve the next batch of hits
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    # Print progress every 20 iterations
    if iteration % batch_size == 0:
        time_spent = time.time() - start_time
        print(f"Iteration: {iteration}, Total documents pulled: {total_docs}")
        print(f"Time spent for last {batch_size} iterations: {time_spent:.2f} seconds")
        # Reset the timer after each batch
        start_time = time.time()

# Once all data is retrieved, clear the scroll context
client.clear_scroll(scroll_id=scroll_id)

print(f"All documents pulled. Total: {total_docs}")


In [None]:
from elasticsearch import Elasticsearch

client = Elasticsearch(
    "http://192.168.5.188:9200/", 
    basic_auth=("despot.markovic", "KJmhyg7cPxfVG84UTuE")
)

index_name = "insightful-fragments-v8"
mapping = client.indices.get_mapping(index=index_name)
print(mapping)


In [None]:
response = client.search(
    index="insightful-fragments-v8",
    query={
        "match_all": {}
    },
    size=5
)

for hit in response['hits']['hits']:
    print(hit['_source'])


In [None]:
from elasticsearch import Elasticsearch
import time

# Connect to the Elasticsearch cluster
client = Elasticsearch(
    "http://192.168.5.188:9200/", 
    basic_auth=("despot.markovic", "KJmhyg7cPxfVG84UTuE")
)

# Store pulled documents
all_documents = []

# Scroll parameters
scroll_size = 10000
scroll_time = '2m'

# Specify the teamId to filter data
target_team_id = "wxqdcesicguzlsh"

# Start timing
start_time = time.time()

# Initial search query with scroll
response = client.search(
    index="insightful-fragments-v8",
    size=scroll_size,
    scroll=scroll_time,
    query={
        "term": {
            "teamId": target_team_id
        }
    }
)

# Extract the first scroll ID and documents
scroll_id = response['_scroll_id']
hits = response['hits']['hits']

# Continue scrolling until no more hits
total_docs = 0
iteration = 0
batch_size = 20

while len(hits) > 0:
    iteration += 1
    for hit in hits:
        # Append each document to the list
        all_documents.append(hit['_source'])
        total_docs += 1

    # Fetch the next batch of results using the scroll ID
    response = client.scroll(scroll_id=scroll_id, scroll=scroll_time)
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    # Log progress every 20 iterations
    if iteration % batch_size == 0:
        time_spent = time.time() - start_time
        print(f"Iteration: {iteration}, Total documents pulled: {total_docs}")
        print(f"Time spent for last {batch_size} iterations: {time_spent:.2f} seconds")
        # Reset the timer
        start_time = time.time()

# Clear scroll context
client.clear_scroll(scroll_id=scroll_id)

# Final output
print(f"All documents pulled. Total: {total_docs}")


### Convert list of dicts to pd df

In [None]:
import pandas as pd

# Convert the list of dicts into a DataFrame
df = pd.DataFrame(all_documents)

In [None]:
# Print number of unique values (employeeIds)
print(f"Number of unique employees: {len(df.employeeId.unique())}")

In [None]:
# Try to save to csv file
from pathlib import Path

path = Path("./wxqdcesicguzlsh_test_dataset.csv")

with open(path, 'w') as fp:
    df.to_csv(path, sep=';', encoding='utf-8', errors='replace')

### Loading Datasets

In [116]:
from pathlib import Path
import pandas as pd

# Testing load csv file:
path = Path("data/150_users_dataset.csv")

with open(path, 'r', encoding='utf-8') as fp:
    df = pd.read_csv(fp, sep=';')

  df = pd.read_csv(fp, sep=';')


### Loading mappings for sites/apps

In [None]:
df['id']

0          aa2c26a1-09dc-4f59-9e56-62e808e53a57
1          8f4a1817-0ad9-484e-a079-b87f99e3d6e5
2          e302822a-d3c5-44a3-ba1e-a6fc1035ebef
3          a8a89171-49aa-4ca9-b7cc-20637b610673
4          3f921b66-3efa-4818-ba49-aa690ef00d93
                           ...                 
9439823    07832fa2-29c4-4544-a37c-49858a253f71
9439824    28652618-c97a-4180-8185-fbf5d41fd1a9
9439825    628240e0-6533-429e-a5eb-091e27e185ff
9439826    d9ec5deb-f44d-44e0-b12d-443f4be48a56
9439827    71934f5e-64cb-43e9-a967-e8502bf1663d
Name: id, Length: 9439828, dtype: object

: 

In [75]:
import json

# Testing load csv file:
path_apps = Path("mappings/app_mappings_2st_round.csv")
path_sites = Path("mappings/site_mappings_3rd_round_.csv")
path_browsers = Path("mappings/browsers.csv")
exclude_mappings = Path("mappings/exclude_mappings.json")

with open(path_apps, 'r', encoding='utf-8') as fp:
    mappings_apps = pd.read_csv(fp)

with open(path_sites, 'r', encoding='utf-8') as fp:
    mappings_sites = pd.read_csv(fp)

with open(path_browsers, 'r', encoding='utf-8') as fp:
    browsers = pd.read_csv(fp, sep=';')

with open(exclude_mappings, 'r', encoding='utf-8') as fp:
    exclude_mappings = json.load(fp)


In [76]:
mappings_apps['app_mapping_v2'] = mappings_apps['app_mapping_v2'] + "-Local"

# Drop rows where 'site' is NaN
mappings_sites = mappings_sites.dropna(subset=['site'])


### Data processing / agregations

In [77]:
from pathlib import Path
import pandas as pd

unique_apps = browsers.browsers.unique()

# 1. Adding 'Concentration Lost' where app is inactive
df.loc[df['active'] == False, 'app'] = 'Concentration Lost'

# 2. Convert 'start' and 'end' timestamps to datetime and sort the DataFrame
df['start_time'] = pd.to_datetime(df['start'], unit='ms')
df['end_time'] = pd.to_datetime(df['end'], unit='ms')
df = df.sort_values(by=['employeeId', 'start_time'])

# 3. Rewrite 'app' as 'Private Links' where 'app' is in unique_apps and 'site' is NaN
df.loc[df['app'].isin(unique_apps) & df['site'].isna(), 'app'] = 'Private Links'

# Keep only necessary columns
df = df[['employeeId', 'app', 'site', 'start_time', 'end_time', 'mouseClicks', 'keystrokes', 'mic', 'mouseScroll', 'camera']]


# Display the processed DataFrame
df.head()


Unnamed: 0,employeeId,app,site,start_time,end_time,mouseClicks,keystrokes,mic,mouseScroll,camera
50099,w--4rnszliaxy12,Workpuls,,2024-09-02 09:23:12.586,2024-09-02 09:23:12.839,0,0,False,0.0,False
50100,w--4rnszliaxy12,Workpuls,,2024-09-02 09:23:14.200,2024-09-02 09:23:18.272,1,0,False,0.0,False
50101,w--4rnszliaxy12,Windows Explorer,,2024-09-02 09:23:18.272,2024-09-02 09:23:20.063,0,0,False,0.0,False
50102,w--4rnszliaxy12,Private Links,,2024-09-02 09:23:20.063,2024-09-02 09:23:26.523,3,0,False,0.0,False
50103,w--4rnszliaxy12,Google Chrome,docs.google.com,2024-09-02 09:23:26.523,2024-09-02 09:23:26.878,1,0,False,0.0,False


### Mappings sites/apps

In [12]:
# Step 1: Ensure excluded sites map to themselves in mappings_sites
mappings_sites.loc[mappings_sites['site'].isin(exclude_mappings['sites']), 'site_mapping'] = mappings_sites['site']

# Map site values in df using mappings_sites
df = df.merge(mappings_sites[['site', 'site_mapping']], on='site', how='left')
# Replace app values in df with site_mapping where mapping exists
df['app'] = df['site_mapping'].combine_first(df['app'])
# Drop the temporary site_mapping column
df.drop(columns=['site_mapping'], inplace=True)

# Step 2: Ensure excluded apps map to themselves in mappings_apps
mappings_apps.loc[mappings_apps['app'].isin(exclude_mappings['apps']), 'app_mapping_v2'] = mappings_apps['app']

# Map app values in df using mappings_apps
df = df.merge(mappings_apps[['app', 'app_mapping_v2']], on='app', how='left')
# Replace app values in df with app_mapping_v2 where mapping exists
df['app'] = df['app_mapping_v2'].combine_first(df['app'])
# Drop the temporary app_mapping_v2 column
df.drop(columns=['app_mapping_v2'], inplace=True)

# # Keep only necessary columns in the final DataFrame
# df = df[['employeeId', 'app', 'start_time', 'end_time']]

# Replace multiple spaces with a single underscore in the 'app' column
df['app'] = df['app'].str.replace(r'\s+', '_', regex=True)

In [13]:
# Sort by 'employeeId' and 'start_time' to ensure chronological order
df = df.sort_values(by=['employeeId', 'start_time']).reset_index(drop=True)

In [None]:
df = pd.read_csv('ProcessingPipe/data/processed_data_v2.csv')
df.head()

In [14]:
import pandas as pd

# Sort by 'employeeId' and 'start_time' to ensure chronological order
df = df.sort_values(by=['employeeId', 'start_time']).reset_index(drop=True)

# Identify where a new group should start
df['new_group'] = (
    (df['employeeId'] != df['employeeId'].shift()) |
    (df['app'] != df['app'].shift()) |        
    (df['start_time'] != df['end_time'].shift()) 
)

# Create a cumulative sum to assign group IDs
df['group_id'] = df['new_group'].cumsum()

# Group by 'employeeId' and 'group_id' and aggregate
df_merged = df.groupby(['employeeId', 'group_id'], as_index=False).agg({
    'start_time': 'first',
    'end_time': 'last',
    'app': 'first'
})

# Drop the 'group_id' and 'new_group' columns if not needed
df_merged = df_merged.drop(columns=['group_id'])

# Sort the merged DataFrame by 'start_time' to maintain chronological order
df_merged = df_merged.sort_values(by=['employeeId', 'start_time']).reset_index(drop=True)

## Delete this all vvvvvvv

In [15]:
df_merged.head()

Unnamed: 0,employeeId,start_time,end_time,app
0,w--4rnszliaxy12,2024-09-02 09:23:12.586,2024-09-02 09:23:12.839,Workpuls
1,w--4rnszliaxy12,2024-09-02 09:23:14.200,2024-09-02 09:23:18.272,Workpuls
2,w--4rnszliaxy12,2024-09-02 09:23:18.272,2024-09-02 09:23:20.063,Windows_Explorer
3,w--4rnszliaxy12,2024-09-02 09:23:20.063,2024-09-02 09:23:26.523,Private_Links
4,w--4rnszliaxy12,2024-09-02 09:23:26.523,2024-09-02 09:23:27.746,docs.google.com


In [105]:
import pandas as pd

df1 = pd.read_csv("pipeline/data/processed_data.csv")
df1 = df1.iloc[:50, :]
df1.to_csv("data/day_point_dataset_50_rows_aditional_features.csv")


In [72]:
df1.head()

Unnamed: 0,employeeId,app,app_durations,app_start_times,app_end_times,mouseClicks,keystrokes,mic,mouseScroll,camera,start_time,end_time,hours_until_next_workday,workday_duration
0,w--4rnszliaxy12_1,"['Workpuls', 'Windows_Explorer', 'Private_Link...","[0.09476666666666667, 0.029849999999999998, 0....","[Timestamp('2024-09-02 09:23:12.586000'), Time...","[Timestamp('2024-09-02 09:23:18.272000'), Time...","[1, 0, 3, 1, 2, 0, 2, 1, 1, 2, 0, 6, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0, 1, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",2024-09-02 09:23:12.586,2024-09-02 20:01:35.212,255.667794,638.3771
1,w--4rnszliaxy12_10,"['Workpuls', 'discord.com', 'docs.google.com',...","[0.07228333333333332, 0.27903333333333336, 0.1...","[Timestamp('2024-09-13 11:41:39.272000'), Time...","[Timestamp('2024-09-13 11:41:43.609000'), Time...","[1, 4, 1, 11, 1, 2, 3, 1, 1, 1, 2, 7, 1, 2, 0,...","[0, 8, 0, 2, 0, 0, 131, 0, 0, 0, 0, 226, 0, 67...","[False, False, False, False, False, False, Fal...","[0.0, 16.0, 0.0, 133.0, 22.0, 12.0, 3.0, 0.0, ...","[False, False, False, False, False, False, Fal...",2024-09-13 11:41:39.272,2024-09-13 20:03:15.818,15.699346,501.6091
2,w--4rnszliaxy12_11,"['Workpuls', 'Private_Links', 'docs.google.com...","[0.11863333333333334, 0.15068333333333334, 0.0...","[Timestamp('2024-09-14 11:45:13.464000'), Time...","[Timestamp('2024-09-14 11:45:20.582000'), Time...","[1, 6, 0, 1, 1, 1, 7, 6, 1, 1, 3, 3, 2, 5, 1, ...","[0, 0, 0, 2, 0, 0, 1, 3, 0, 0, 4, 2, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...","[16.0, 0.0, 0.0, 0.0, 0.0, 0.0, 18.0, 13.0, 1....","[False, False, False, False, False, False, Fal...",2024-09-14 11:45:13.464,2024-09-14 20:01:58.878,15.652016,496.7569
3,w--4rnszliaxy12_12,"['Workpuls', 'discord.com', 'docs.google.com',...","[0.06655, 0.26875, 0.06731666666666666, 0.7145...","[Timestamp('2024-09-15 11:41:06.134000'), Time...","[Timestamp('2024-09-15 11:41:10.127000'), Time...","[1, 6, 1, 6, 1, 1, 0, 0, 1, 1, 1, 17, 1, 3, 1,...","[0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, ...","[False, False, True, True, True, True, True, T...","[0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",2024-09-15 11:41:06.134,2024-09-15 20:01:12.075,13.594557,500.099017
4,w--4rnszliaxy12_13,"['Workpuls', 'Private_Links', 'discord.com', '...","[0.08503333333333334, 0.12776666666666667, 0.9...","[Timestamp('2024-09-16 09:36:52.481000'), Time...","[Timestamp('2024-09-16 09:36:57.583000'), Time...","[1, 4, 9, 4, 18, 5, 1, 2, 1, 1, 1, 1, 4, 2, 3,...","[0, 0, 63, 0, 6, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[False, False, False, False, False, False, Fal...","[0.0, 0.0, 6.0, 0.0, 15.0, 0.0, 0.0, 0.0, 0.0,...","[False, False, False, False, False, False, Fal...",2024-09-16 09:36:52.481,2024-09-16 20:15:06.288,15.81185,638.230117


In [65]:
df3.dtypes

employeeId                   object
app                          object
app_durations                object
app_start_times              object
app_end_times                object
start_time                   object
end_time                     object
hours_until_next_workday    float64
workday_duration            float64
dtype: object

In [66]:
df3.columns

Index(['employeeId', 'app', 'app_durations', 'app_start_times',
       'app_end_times', 'start_time', 'end_time', 'hours_until_next_workday',
       'workday_duration'],
      dtype='object')

In [68]:
import pandas as pd
from ast import literal_eval

# Sample DataFrame (assuming you already have it loaded as `df`)
# Converting columns to proper list formats

def safe_convert_to_list(val):
    try:
        return literal_eval(val) if pd.notna(val) else []
    except (ValueError, SyntaxError):
        return []

# Apply conversion to the columns that need to be lists
df3['app'] = df3['app'].apply(safe_convert_to_list)
df3['app_durations'] = df3['app_durations'].apply(safe_convert_to_list)
df3['app_start_times'] = df3['app_start_times'].apply(safe_convert_to_list)
df3['app_end_times'] = df3['app_end_times'].apply(safe_convert_to_list)

# Verify conversion
print(df.head())


        employeeId app             site              start_time  \
0  w--4rnszliaxy12  []              NaN 2024-09-02 09:23:12.586   
1  w--4rnszliaxy12  []              NaN 2024-09-02 09:23:14.200   
2  w--4rnszliaxy12  []              NaN 2024-09-02 09:23:18.272   
3  w--4rnszliaxy12  []              NaN 2024-09-02 09:23:20.063   
4  w--4rnszliaxy12  []  docs.google.com 2024-09-02 09:23:26.523   

                 end_time  mouseClicks  keystrokes    mic  mouseScroll camera  \
0 2024-09-02 09:23:12.839            0           0  False          0.0  False   
1 2024-09-02 09:23:18.272            1           0  False          0.0  False   
2 2024-09-02 09:23:20.063            0           0  False          0.0  False   
3 2024-09-02 09:23:26.523            3           0  False          0.0  False   
4 2024-09-02 09:23:26.878            1           0  False          0.0  False   

   new_group  group_id  
0       True         1  
1       True         2  
2       True         3  
3       Tr

In [69]:
df3.head()

Unnamed: 0,employeeId,app,app_durations,app_start_times,app_end_times,start_time,end_time,hours_until_next_workday,workday_duration
0,w--4rnszliaxy12_1,"[Workpuls, Windows_Explorer, Private_Links, do...","[0.09476666666666667, 0.029849999999999998, 0....",[],[],2024-09-02 09:23:12.586,2024-09-02 20:01:35.212,255.667794,638.3771
1,w--4rnszliaxy12_10,"[Workpuls, discord.com, docs.google.com, Misce...","[0.07228333333333332, 0.27903333333333336, 0.1...",[],[],2024-09-13 11:41:39.272,2024-09-13 20:03:15.818,15.699346,501.6091
2,w--4rnszliaxy12_11,"[Workpuls, Private_Links, docs.google.com, Pri...","[0.11863333333333334, 0.15068333333333334, 0.0...",[],[],2024-09-14 11:45:13.464,2024-09-14 20:01:58.878,15.652016,496.7569
3,w--4rnszliaxy12_12,"[Workpuls, discord.com, docs.google.com, Marke...","[0.06655, 0.26875, 0.06731666666666666, 0.7145...",[],[],2024-09-15 11:41:06.134,2024-09-15 20:01:12.075,13.594557,500.099017
4,w--4rnszliaxy12_13,"[Workpuls, Private_Links, discord.com, Private...","[0.08503333333333334, 0.12776666666666667, 0.9...",[],[],2024-09-16 09:36:52.481,2024-09-16 20:15:06.288,15.81185,638.230117


#### ^^^^^^^^^^^^^^^^^^^

### Creating working day function

In [16]:
import pandas as pd
from datetime import timedelta

def create_working_day(df, max_workday_gap=timedelta(hours=2)):
    result = []
    
    for employee_id, group in df.groupby('employeeId'):
        # Sort the group by 'start_time' to process logs in chronological order
        group = group.sort_values('start_time').reset_index(drop=True)
        
        daily_apps = []
        daily_durations = []
        daily_app_start_times = []
        daily_app_end_times = []
        day_counter = 1
        last_end = None
        start_time_of_workday = None

        for idx, row in group.iterrows():
            app_name = row['app']
            start_time = row['start_time']
            end_time = row['end_time']

            if last_end is None:
                # Initialize start_time_of_workday and last_end
                start_time_of_workday = start_time
                last_end = end_time

                daily_apps.append(app_name)
                daily_durations.append((end_time - start_time).total_seconds() / 60)
                daily_app_start_times.append(start_time)
                daily_app_end_times.append(end_time)
                continue

            gap = start_time - last_end

            # If the gap is larger than or equal to max_workday_gap, start a new workday
            if gap >= max_workday_gap:
                # Append the current workday to result
                result.append({
                    'employeeId': f'{employee_id}_{day_counter}',
                    'app': daily_apps,
                    'app_durations': daily_durations,
                    'app_start_times': daily_app_start_times,
                    'app_end_times': daily_app_end_times,
                    'start_time': start_time_of_workday,
                    'end_time': last_end
                })
                
                day_counter += 1
                daily_apps = []
                daily_durations = []
                daily_app_start_times = []
                daily_app_end_times = []
                start_time_of_workday = start_time
                last_end = end_time

                # Start the new workday with the current app
                daily_apps.append(app_name)
                daily_durations.append((end_time - start_time).total_seconds() / 60)
                daily_app_start_times.append(start_time)
                daily_app_end_times.append(end_time)
                continue

            # If the gap is 20 seconds or less, label as "Log Lost/Software Bug"
            if timedelta(seconds=0) < gap <= timedelta(seconds=20):
                daily_apps.append('Log Lost/Software Bug')
                daily_durations.append(gap.total_seconds() / 60)
                daily_app_start_times.append(last_end)
                daily_app_end_times.append(start_time)
            
            # If the gap is larger than 20 seconds and less than max_workday_gap, label as "Pause"
            elif timedelta(seconds=20) < gap < max_workday_gap:
                daily_apps.append('Pause')
                daily_durations.append(gap.total_seconds() / 60)
                daily_app_start_times.append(last_end)
                daily_app_end_times.append(start_time)

            # Add the current app duration
            daily_apps.append(app_name)
            daily_durations.append((end_time - start_time).total_seconds() / 60)
            daily_app_start_times.append(start_time)
            daily_app_end_times.append(end_time)

            # Update last_end to the current app's end time
            last_end = end_time

        # Append any remaining apps and durations for the final day
        if daily_apps:
            result.append({
                'employeeId': f'{employee_id}_{day_counter}',
                'app': daily_apps,
                'app_durations': daily_durations,
                'app_start_times': daily_app_start_times,
                'app_end_times': daily_app_end_times,
                'start_time': start_time_of_workday,
                'end_time': last_end
            })

    # Create a DataFrame from the result list
    result_df = pd.DataFrame(result)
    
    return result_df



In [17]:
result_df = create_working_day(df_merged, max_workday_gap=timedelta(hours=1))

### Aditional processings 

In [19]:
def merge_log_lost_and_same_apps(df):
    """
    This function processes the DataFrame per employee per day.
    For each row (which represents a workday for an employee), it processes the lists in 'app',
    'app_durations', 'app_start_times', 'app_end_times', and performs the following:
    - Merges any 'Log Lost/Software Bug' entries with the previous app:
        - The 'Log Lost/Software Bug' entry is removed.
        - The previous app's end time and duration are extended to include the 'Log Lost/Software Bug' period.
    - Merges any two apps that are the same and next to each other, and the first app's end time is the same as the second app's start time:
        - The two app entries are merged into one.
        - The duration is summed.
        - The start time is the start time of the first app.
        - The end time is the end time of the second app.
    """
    import pandas as pd

    # Create a list to store processed rows
    processed_rows = []
    
    for idx, row in df.iterrows():
        apps = row['app']
        durations = row['app_durations']
        app_starts = row['app_start_times']
        app_ends = row['app_end_times']
        employeeId = row['employeeId']
        day_start = row['start_time']
        day_end = row['end_time']
        
        # Initialize lists for processed data
        new_apps = []
        new_durations = []
        new_app_starts = []
        new_app_ends = []
        
        i = 0
        while i < len(apps):
            app_name = apps[i]
            duration = durations[i]
            start_time = app_starts[i]
            end_time = app_ends[i]
            
            # If the app is 'Log Lost/Software Bug'
            if app_name == 'Log Lost/Software Bug':
                # Merge with previous app
                if len(new_apps) > 0:
                    # Extend the previous app's end time and duration
                    new_durations[-1] += duration
                    new_app_ends[-1] = end_time
                else:
                    # If there is no previous app, skip it
                    pass
                i += 1
                continue
            
            # If current app is same as previous and end time of previous is same as start time
            if (len(new_apps) > 0 and
                app_name == new_apps[-1] and
                new_app_ends[-1] == start_time):
                # Merge with previous app
                new_durations[-1] += duration
                new_app_ends[-1] = end_time
            else:
                # Add new app
                new_apps.append(app_name)
                new_durations.append(duration)
                new_app_starts.append(start_time)
                new_app_ends.append(end_time)
            i += 1
        
        # Create a new row with the processed data
        new_row = {
            'employeeId': employeeId,
            'app': new_apps,
            'app_durations': new_durations,
            'app_start_times': new_app_starts,
            'app_end_times': new_app_ends,
            'start_time': day_start,
            'end_time': day_end
        }
        processed_rows.append(new_row)
    
    # Create a new DataFrame
    df_processed = pd.DataFrame(processed_rows)
    return df_processed

def delete_working_days(df):
    """
    This function removes all the working days (rows) where 'start_time' is between
    5th of September 2024 and 13th of September 2024.
    """

    # Ensure 'start_time' is in datetime format
    df['start_time'] = pd.to_datetime(df['start_time'])

    # Define the date range
    start_date = pd.to_datetime('2024-09-05')
    end_date = pd.to_datetime('2024-09-13')

    # Filter out the rows within the specified date range
    df_filtered = df[~((df['start_time'] >= start_date) & (df['start_time'] <= end_date))]

    # Reset index if necessary
    df_filtered = df_filtered.reset_index(drop=True)
    return df_filtered


In [20]:
# First, merge 'Log Lost/Software Bug' entries and consecutive same apps
df_final = merge_log_lost_and_same_apps(result_df)

# Then, delete the specified working days
df_final = delete_working_days(df_final)

### Adding info in how much does the next day starts?

In [22]:
# Convert 'start_time' and 'end_time' to datetime objects
df_final['start_time'] = pd.to_datetime(df_final['start_time'])
df_final['end_time'] = pd.to_datetime(df_final['end_time'])

# Extract base employee ID
df_final['base_employeeId'] = df_final['employeeId'].str.extract(r'^(.*)_\d+$')
df_final['base_employeeId'] = df_final['base_employeeId'].fillna(df_final['employeeId'])

# Sort the DataFrame
df_final = df_final.sort_values(by=['base_employeeId', 'start_time']).reset_index(drop=True)

# Calculate 'next_start_time'
df_final['next_start_time'] = df_final.groupby('base_employeeId')['start_time'].shift(-1)

# Calculate 'hours_until_next_workday'
df_final['hours_until_next_workday'] = (df_final['next_start_time'] - df_final['end_time']).dt.total_seconds() / 3600
df_final['hours_until_next_workday'] = df_final['hours_until_next_workday'].fillna(-1)

# Calculate 'workday_duration' in minutes
df_final['workday_duration'] = (df_final['end_time'] - df_final['start_time']).dt.total_seconds() / 60

# Drop temporary columns if not needed
df_final = df_final.drop(columns=['next_start_time', 'base_employeeId'])

# Verify the updated DataFrame
print(df_final[['employeeId', 'start_time', 'end_time', 'workday_duration', 'hours_until_next_workday']])


              employeeId              start_time                end_time  \
0      w--4rnszliaxy12_1 2024-09-02 09:23:12.586 2024-09-02 12:20:17.098   
1      w--4rnszliaxy12_2 2024-09-02 13:35:03.404 2024-09-02 20:01:35.212   
2     w--4rnszliaxy12_10 2024-09-13 11:41:39.272 2024-09-13 20:03:15.818   
3     w--4rnszliaxy12_11 2024-09-14 11:45:13.464 2024-09-14 20:01:58.878   
4     w--4rnszliaxy12_12 2024-09-15 11:41:06.134 2024-09-15 20:01:12.075   
...                  ...                     ...                     ...   
7317  w1rgkwz_w-eoxs0_15 2024-10-15 22:46:29.786 2024-10-16 00:45:00.000   
7318  w1rgkwz_w-eoxs0_16 2024-10-16 12:27:49.192 2024-10-16 17:00:00.000   
7319  w1rgkwz_w-eoxs0_17 2024-10-16 18:35:27.107 2024-10-16 22:15:00.000   
7320  w1rgkwz_w-eoxs0_18 2024-10-17 12:21:55.256 2024-10-17 23:30:00.000   
7321  w1rgkwz_w-eoxs0_19 2024-10-18 12:10:34.193 2024-10-18 22:48:16.256   

      workday_duration  hours_until_next_workday  
0           177.075200              

### Handling situations where len of day is 45 min or less and merging some splitting inside one working day - 1st

In [23]:
import pandas as pd

def process_workdays(df):
    # Ensure 'start_time' and 'end_time' are datetime objects
    df['start_time'] = pd.to_datetime(df['start_time'])
    df['end_time'] = pd.to_datetime(df['end_time'])

    # Extract base employee ID (e.g., 'emp_1' from 'emp_1_30')
    df['base_employeeId'] = df['employeeId'].str.extract(r'^(.*)_\d+$')
    df['base_employeeId'] = df['base_employeeId'].fillna(df['employeeId'])

    # Sort the DataFrame by 'base_employeeId' and 'start_time'
    df = df.sort_values(by=['base_employeeId', 'start_time']).reset_index(drop=True)

    # Delete all rows where 'workday_duration' < 45 minutes
    df = df[df['workday_duration'] >= 45].reset_index(drop=True)

    # Recalculate 'hours_until_next_workday' after deletion
    def recalculate_hours_until_next_workday(emp_df):
        emp_df = emp_df.sort_values('start_time').reset_index(drop=True)
        emp_df['next_start_time'] = emp_df['start_time'].shift(-1)
        emp_df['hours_until_next_workday'] = (emp_df['next_start_time'] - emp_df['end_time']).dt.total_seconds() / 3600
        emp_df['hours_until_next_workday'] = emp_df['hours_until_next_workday'].fillna(-1)
        return emp_df.drop(columns=['next_start_time'])

    # Reset index to ensure unique indices before grouping
    df = df.reset_index(drop=True)

    # Add a temporary unique identifier for each row
    df['unique_id'] = df.index

    # Apply the function and reset the index to avoid duplicate labels
    df = df.groupby('base_employeeId', group_keys=False).apply(recalculate_hours_until_next_workday).reset_index(drop=True)

    # Now, for each employee, merge adjacent workdays where 'hours_until_next_workday' < 3 hours
    indices_to_drop = set()
    for employee_id in df['base_employeeId'].unique():
        emp_df = df[df['base_employeeId'] == employee_id].sort_values('start_time').reset_index(drop=True)
        idx = 0
        while idx < len(emp_df) - 1:
            current_row = emp_df.loc[idx]
            next_row = emp_df.loc[idx + 1]
            current_unique_id = current_row['unique_id']
            next_unique_id = next_row['unique_id']

            # Calculate actual gap between current end_time and next start_time
            actual_gap_hours = (next_row['start_time'] - current_row['end_time']).total_seconds() / 3600

            # Continue to merge while actual gap is less than 3 hours
            if 0 <= actual_gap_hours < 3:
                merge_unique_ids = [current_unique_id, next_unique_id]
                total_pause_duration = actual_gap_hours * 60  # Convert to minutes

                # Merge the workdays
                first_unique_id = merge_unique_ids[0]
                first_row_index = df.index[df['unique_id'] == first_unique_id][0]

                # Initialize merged lists with the first workday's data
                first_row = df.loc[df['unique_id'] == first_unique_id].iloc[0]
                merged_app = first_row['app'].copy()
                merged_app_durations = first_row['app_durations'].copy()
                merged_app_start_times = first_row['app_start_times'].copy()
                merged_app_end_times = first_row['app_end_times'].copy()

                # Loop through the rest of the workdays to merge
                for i in range(1, len(merge_unique_ids)):
                    uid_prev = merge_unique_ids[i - 1]
                    uid_curr = merge_unique_ids[i]

                    # Insert 'Pause' between workdays
                    merged_app.append('Pause')

                    # Retrieve times correctly
                    pause_start_time = df.loc[df['unique_id'] == uid_prev, 'end_time'].iloc[0]
                    pause_end_time = df.loc[df['unique_id'] == uid_curr, 'start_time'].iloc[0]

                    # Calculate pause duration
                    pause_duration_minutes = (pause_end_time - pause_start_time).total_seconds() / 60
                    merged_app_durations.append(pause_duration_minutes)
                    merged_app_start_times.append(pause_start_time)
                    merged_app_end_times.append(pause_end_time)

                    # Append the lists from the current workday
                    row_curr = df.loc[df['unique_id'] == uid_curr].iloc[0]
                    merged_app.extend(row_curr['app'])
                    merged_app_durations.extend(row_curr['app_durations'])
                    merged_app_start_times.extend(row_curr['app_start_times'])
                    merged_app_end_times.extend(row_curr['app_end_times'])

                # Update 'end_time', 'workday_duration', 'hours_until_next_workday'
                last_unique_id = merge_unique_ids[-1]
                last_row_index = df.index[df['unique_id'] == last_unique_id][0]
                df.at[first_row_index, 'end_time'] = df.at[last_row_index, 'end_time']
                total_workday_duration = df[df['unique_id'].isin(merge_unique_ids)]['workday_duration'].sum()
                total_workday_duration += total_pause_duration
                df.at[first_row_index, 'workday_duration'] = total_workday_duration
                df.at[first_row_index, 'hours_until_next_workday'] = df.at[last_row_index, 'hours_until_next_workday']

                # Update the merged lists
                df.at[first_row_index, 'app'] = merged_app
                df.at[first_row_index, 'app_durations'] = merged_app_durations
                df.at[first_row_index, 'app_start_times'] = merged_app_start_times
                df.at[first_row_index, 'app_end_times'] = merged_app_end_times

                # Mark other workdays for dropping
                indices_to_drop.update(merge_unique_ids[1:])

                # Update emp_df to reflect changes
                emp_df.loc[idx, 'end_time'] = df.at[first_row_index, 'end_time']
                emp_df.loc[idx, 'workday_duration'] = df.at[first_row_index, 'workday_duration']
                emp_df.loc[idx, 'hours_until_next_workday'] = df.at[first_row_index, 'hours_until_next_workday']
                emp_df = emp_df.drop(idx + 1).reset_index(drop=True)

                # Do not increment idx to check for further merges
            else:
                idx += 1  # Move to the next workday

        # Recalculate 'hours_until_next_workday' after merging
        emp_df = emp_df.sort_values('start_time').reset_index(drop=True)
        emp_df['next_start_time'] = emp_df['start_time'].shift(-1)
        emp_df['hours_until_next_workday'] = (emp_df['next_start_time'] - emp_df['end_time']).dt.total_seconds() / 3600
        emp_df['hours_until_next_workday'] = emp_df['hours_until_next_workday'].fillna(-1)

        # Update the main df with recalculated 'hours_until_next_workday'
        for idx2, row in emp_df.iterrows():
            df_index = df.index[df['unique_id'] == row['unique_id']][0]
            df.at[df_index, 'hours_until_next_workday'] = row['hours_until_next_workday']

    # Drop the rows marked for dropping
    df = df[~df['unique_id'].isin(indices_to_drop)].reset_index(drop=True)

    # Drop the temporary columns as they're no longer needed
    df = df.drop(columns=['base_employeeId', 'unique_id'])
    return df


In [24]:
df_processed = process_workdays(df_final)

  df = df.groupby('base_employeeId', group_keys=False).apply(recalculate_hours_until_next_workday).reset_index(drop=True)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# 2. Loop through the dataset and check the lengths of the list columns
list_columns = ['app', 'app_durations', 'app_start_times', 'app_end_times']
for index, row in df_processed.iterrows():
    lengths = [len(row[col]) if isinstance(row[col], list) else 0 for col in list_columns]
    if len(set(lengths)) != 1:
        print(f"Row {index} has inconsistent list lengths:")
        for col, length in zip(list_columns, lengths):
            print(f" - {col}: {length}")
        print("Row data:")
        print(row)
        print("\n")

# 3. Plot the histogram of 'workday_duration' column
plt.figure(figsize=(10, 6))
df_processed['workday_duration'].hist(bins=50, edgecolor='black')
plt.xlabel('Workday Duration (minutes)')
plt.ylabel('Frequency')
plt.title('Histogram of Workday Duration')
plt.grid(False)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Assuming df_processed is your DataFrame

# Filter the data to include only values between 0 and 15 hours
filtered_data = df_processed[df_processed['hours_until_next_workday'].between(0, 15)]

# Define the bins from 0 to 15 hours with 100 bins
bins = np.linspace(0, 15, 101)  # 101 edges for 100 bins

# Plot the histogram
plt.figure(figsize=(10, 6))
filtered_data['hours_until_next_workday'].hist(bins=bins, edgecolor='black')
plt.xlabel('Hours Until Next Workday (Hours)')
plt.ylabel('Frequency')
plt.title('Histogram of Hours Until Next Workday (0 to 15 hours)')
plt.grid(False)
plt.show()


In [None]:
print(df_final[df_final.employeeId == 'w--4rnszliaxy12_13'].app.values)
print(df_final[df_final.employeeId == 'w--4rnszliaxy12_13'].app_durations.values)
print(df_final[df_final.employeeId == 'w--4rnszliaxy12_13'].app_start_times.values)
print(df_final[df_final.employeeId == 'w--4rnszliaxy12_13'].app_end_times.values)
print('-------------------------------------------------------------------------------------------------------------------------')
print(df_final[df_final.employeeId == 'w--4rnszliaxy12_14'].app.values)
print(df_final[df_final.employeeId == 'w--4rnszliaxy12_14'].app_durations.values)
print(df_final[df_final.employeeId == 'w--4rnszliaxy12_14'].app_start_times.values)
print(df_final[df_final.employeeId == 'w--4rnszliaxy12_14'].app_end_times.values)

In [None]:
print(df_processed[df_processed.employeeId == 'w--4rnszliaxy12_13'].app.values)
print(df_processed[df_processed.employeeId == 'w--4rnszliaxy12_13'].app_durations.values)
print(df_processed[df_processed.employeeId == 'w--4rnszliaxy12_13'].app_start_times.values)
print(df_processed[df_processed.employeeId == 'w--4rnszliaxy12_13'].app_end_times.values)

In [None]:
len_processed = len(df_processed[df_processed.employeeId == 'w--4rnszliaxy12_20'].app.values[0])
len_processed2 = len(df_processed[df_processed.employeeId == 'w--4rnszliaxy12_20'].app_durations.values[0])
len_processed3 = len(df_processed[df_processed.employeeId == 'w--4rnszliaxy12_20'].app_start_times.values[0])
len_processed4 = len(df_processed[df_processed.employeeId == 'w--4rnszliaxy12_20'].app_end_times.values[0])

print(len_processed)
print(len_processed2)
print(len_processed3)
print(len_processed4)

len_final = len(df_final[df_final.employeeId == 'w--4rnszliaxy12_20'].app.values[0])
len_final2 = len(df_final[df_final.employeeId == 'w--4rnszliaxy12_20'].app_durations.values[0])
len_final3 = len(df_final[df_final.employeeId == 'w--4rnszliaxy12_20'].app_start_times.values[0])
len_final4 = len(df_final[df_final.employeeId == 'w--4rnszliaxy12_20'].app_end_times.values[0])

print(len_final)
print(len_final2)
print(len_final3)
print(len_final4)

len_final = len(df_final[df_final.employeeId == 'w--4rnszliaxy12_21'].app.values[0])
len_final2 = len(df_final[df_final.employeeId == 'w--4rnszliaxy12_21'].app_durations.values[0])
len_final3 = len(df_final[df_final.employeeId == 'w--4rnszliaxy12_21'].app_start_times.values[0])
len_final4 = len(df_final[df_final.employeeId == 'w--4rnszliaxy12_21'].app_end_times.values[0])

print(len_final)
print(len_final2)
print(len_final3)
print(len_final4)

In [None]:
# Assuming 'df_final' is your DataFrame

# Specify the employeeId
employee_id = 'w--4rnszliaxy12_20'

# Get the row corresponding to the employee
employee_row = df_processed[df_processed['employeeId'] == employee_id]

# Check if the employee exists in the DataFrame
if employee_row.empty:
    print(f"Employee with ID '{employee_id}' not found in the DataFrame.")
else:
    # Extract the 'app' list
    app_list = employee_row['app'].values[0]
    
    # Find the indexes where 'app' is 'Pause'
    pause_indexes = [index for index, value in enumerate(app_list) if value == 'Pause']
    
    # Extract the 'app_durations' list
    app_durations_list = employee_row['app_durations'].values[0]
    app_start_list = employee_row['app_start_times'].values[0]
    app_end_list = employee_row['app_end_times'].values[0]
    
    # Retrieve durations at the pause indexes
    pause_durations = [app_durations_list[index] for index in pause_indexes]
    pause_start = [app_start_list[index] for index in pause_indexes]
    pause_end = [app_end_list[index] for index in pause_indexes]
    
    # Print the results
    print(f"Indexes of 'Pause' in 'app' list: {pause_indexes}")
    print(f"Corresponding durations in 'app_durations': {pause_durations}")
    print(f"Corresponding durations in 'app_durations': {pause_start}")
    print(f"Corresponding durations in 'app_durations': {pause_end}")


In [None]:
df_processed.to_csv("day_point_dataset_testset.csv", sep=';')

### Finding similar apps based on the usage patterns

In [None]:
# Train Word2Vec model on the app column
import pandas as pd
import ast
from gensim.models import Word2Vec
from pathlib import Path

df_processed = pd.read_csv("day_point_dataset_v5.csv", sep=';')
df_processed['app'] = df_processed['app'].apply(ast.literal_eval)

# Convert the 'apps' column to a list of lists for Word2Vec
app_sequences = df_processed['app'].tolist()

model = Word2Vec(
    sentences=app_sequences,
    vector_size=100,
    window=4,
    min_count=5,
    sg=1,
    workers=4,
    epochs=20 
)

# Save the trained model for later use
model.save("apps_word2vec_v3.model")

In [None]:
app_sequences

### Testing the Word2Vec model

In [None]:
app_vector = model.wv['app.slack.com']
print("Vector for Visual_Studio_Code:", app_vector)
 
# Find similar apps to 'VSCode'
similar_apps = model.wv.most_similar('app.slack.com', topn=5)
print("Apps similar to Visual_Studio_Code:", similar_apps)

### Inference and clusterization

In [None]:
from gensim.models import Word2Vec

model = Word2Vec.load("apps_word2vec_v2.model")

# Retrieve training parameters
vector_size = model.vector_size
window = model.window
min_count = model.min_count
sg = model.sg
epochs = model.epochs
workers = model.workers
sample = model.sample
negative = model.negative
alpha = model.alpha

# Print the training parameters
print(f"Vector Size (embedding dimension): {vector_size}")
print(f"Window Size (context): {window}")
print(f"Min Count (frequency threshold): {min_count}")
print(f"Skip-Gram (sg=1) or CBOW (sg=0): {sg}")
print(f"Epochs: {epochs}")
print(f"Workers (threads): {workers}")
print(f"Sample (subsampling rate): {sample}")
print(f"Negative Sampling (negative): {negative}")
print(f"Initial Learning Rate (alpha): {alpha}")


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Extract embeddings for all apps
app_vectors = model.wv.vectors

# Run KMeans with varying numbers of clusters and calculate WCSS
wcss = []
for i in range(1, 100):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(app_vectors)
    wcss.append(kmeans.inertia_)

# Plot WCSS to observe the "elbow"
plt.figure(figsize=(8, 5))
plt.plot(range(1, 100), wcss, marker='o', linestyle='-')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS')
plt.show()


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Perform hierarchical clustering using linkage
Z = linkage(model.wv.vectors, method='average', metric='cosine')  # Average linkage with cosine distance

# Plot the dendrogram
plt.figure(figsize=(12, 8))
dendrogram(Z, truncate_mode='level', p=10, labels=model.wv.index_to_key)  # Use p to limit levels shown
plt.title("Dendrogram")
plt.xlabel("App Names")
plt.ylabel("Cosine Distance")
plt.show()


### K means

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import numpy as np

# Load or define your data
data = model.wv.vectors  # Assuming data is from a Word2Vec model

# Store the best parameters for each method
best_k = None
best_k_silhouette = -1
best_threshold = None
best_threshold_silhouette = -1

# --- Finding optimal k for KMeans ---
print("Evaluating KMeans clustering...")
for k in range(2, 301):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(data)
    score = silhouette_score(data, labels, metric='cosine')
    
    if score > best_k_silhouette:
        best_k_silhouette = score
        best_k = k

    if k % 50 == 0:  # Print progress every 50 steps
        print(f"KMeans - Tested k={k}, Silhouette Score={score:.4f}")

print(f"Best KMeans clustering: k={best_k} with Silhouette Score={best_k_silhouette:.4f}\n")

### Aglomerative 

In [None]:
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import silhouette_score
import numpy as np

# Generate linkage matrix using cosine distance
Z = linkage(data, method='average', metric='cosine')

best_threshold = None
best_silhouette = -1
best_n_clusters = 0

print("Evaluating Hierarchical clustering with custom distance thresholds...")
# Loop over thresholds and use fcluster to cut the dendrogram
for threshold in np.arange(0.1, 1.1, 0.01):  # Adjust step size as needed
    labels = fcluster(Z, t=threshold, criterion='distance')
    
    n_clusters = len(set(labels))
    print(f"Threshold {threshold:.2f} resulted in {n_clusters} clusters.")
    
    # Ensure valid number of clusters
    if n_clusters < 2 or n_clusters >= len(data) * 0.9:
        continue
    
    # Calculate silhouette score
    score = silhouette_score(data, labels, metric='cosine')
    
    if score > best_silhouette:
        best_silhouette = score
        best_threshold = threshold
        best_n_clusters = n_clusters

    print(f"Threshold={threshold:.2f}, Silhouette Score={score:.4f}")

print(f"Best threshold: {best_threshold}, Number of clusters: {best_n_clusters}, Best Silhouette Score: {best_silhouette:.4f}")


### Representation

In [None]:
from scipy.cluster.hierarchy import linkage, fcluster
import json

# Assuming `data` contains your app embedding vectors
# Step 1: Generate the linkage matrix using hierarchical clustering
Z = linkage(data, method='average', metric='cosine')

# Step 2: Define the best threshold based on previous analysis
threshold = best_threshold  # Replace with the best threshold from your analysis

# Step 3: Generate labels based on the threshold
labels = fcluster(Z, t=threshold, criterion='distance')

# Step 4: Create a dictionary to store clusters
clusters = {}
app_names = model.wv.index_to_key  # Assuming these are the app names corresponding to embeddings

# Step 5: Populate the dictionary with cluster labels
for app, label in zip(app_names, labels):
    cluster_key = f"cluster_{label}"
    if cluster_key not in clusters:
        clusters[cluster_key] = []
    clusters[cluster_key].append(app)

# Print and inspect the clusters (optional)
print(clusters)

# Save clusters to JSON
with open("app_clusters.json", "w") as json_file:
    json.dump(clusters, json_file, indent=4)

print("Clusters saved to app_clusters.json")


### Visualization

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np

# Extract the embeddings
app_vectors = model.wv.vectors

# Perform t-SNE for dimensionality reduction to 2D
tsne = TSNE(n_components=2, random_state=42)
app_vectors_2d = tsne.fit_transform(app_vectors)

# Convert labels to a numpy array for indexing
labels = np.array(labels)

# Plot each cluster in a different color
plt.figure(figsize=(12, 8))
unique_labels = set(labels)

for label in unique_labels:
    indices = np.where(labels == label)
    cluster_points = app_vectors_2d[indices]
    
    # Plot the points for each cluster
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {label}", alpha=0.6)

# Optional: Annotate points with app names
for i, app_name in enumerate(app_names):
    plt.annotate(app_name, (app_vectors_2d[i, 0], app_vectors_2d[i, 1]), fontsize=6, alpha=0.5)

plt.title("App Clusters Visualization")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.legend()
plt.show()


### Extracting all embeddings

In [None]:
# Retrieve all words and their embeddings
words = model.wv.index_to_key  # List of all words in the vocabulary
embeddings = {word: model.wv[word] for word in words}  # Dictionary of word: embedding

# Convert embeddings to a DataFrame
embeddings_df = pd.DataFrame.from_dict(embeddings, orient='index')

# Export to CSV
embeddings_df.to_csv("all_word_embeddings.csv", index_label="word", sep=';')

# (Optional) Export to JSON
embeddings_df.to_json("all_word_embeddings.json", orient="index")

print("All embeddings have been saved to 'all_word_embeddings.csv' and 'all_word_embeddings.json'")

In [None]:
df_sliced = df

df_sliced['start_time'] = pd.to_datetime(df_sliced['start'], unit='ms')
df_sliced['end_time'] = pd.to_datetime(df_sliced['end'], unit='ms')

df_sliced = (
    df_sliced
    .sort_values(by=['employeeId', 'start_time'], ascending=[True, True])
    .reset_index(drop=True)
)

df_sliced['duration'] = (df_sliced['end_time'] - df_sliced['start_time']).dt.total_seconds()

df_sliced['start_date'] = df_sliced['start_time'].astype(str).apply(lambda x: x.split(' ')[0])
df_sliced['end_date'] = df_sliced['end_time'].astype(str).apply(lambda x: x.split(' ')[0])

display(df_sliced.head())
print(f'Shape: {df_sliced.shape}.\n')

display(
    df_sliced
    .start_date
    .value_counts()
    .sort_index()
    .reset_index()
    .assign(proportion=lambda x: ((x['count'] / x['count'].sum())*100).round(2).astype('str') + ' %')
    .assign(cum_count=lambda x: x['count'].cumsum())
    .assign(cum_proportion=lambda x: ((x['cum_count'] / x['count'].sum())*100).round(2).astype(str) + ' %')
)


In [None]:
df_sliced = df

df_sliced['start_time'] = pd.to_datetime(df_sliced['start'], unit='ms')
df_sliced['end_time'] = pd.to_datetime(df_sliced['end'], unit='ms')

df_sliced['start_date'] = df_sliced['start_time'].astype(str).apply(lambda x: x.split(' ')[0])
df_sliced['end_date'] = df_sliced['end_time'].astype(str).apply(lambda x: x.split(' ')[0])

df_sliced['duration'] = (df_sliced['end_time'] - df_sliced['start_time']).dt.total_seconds()

employee_date_groupby = df_sliced.groupby(by=['employeeId', 'start_date'])

df_employee_date = (
    employee_date_groupby['start_time']
    .min()
    .reset_index()
    .merge(right=(
                    employee_date_groupby['end_time']
                    .max()
                    .reset_index()
                 ),
           how='inner',
           on=['employeeId', 'start_date'])
    .sort_values(by=['employeeId', 'start_date'], ascending=True)
    .assign(start_minus_end=lambda x: (x['end_time'] - x['start_time']).astype(str).apply(lambda x: x.split(' ')[2]))
    .rename(columns={'start_minus_end':'start_time - end_time'})
    .merge(right=(
                    employee_date_groupby['app']
                    .count()
                    .reset_index()
                    .rename(columns={'app':'app_count'})
                 ),
           how='inner',
           on=['employeeId', 'start_date'])
    .merge(right=(
                    employee_date_groupby['app']
                    .nunique()
                    .reset_index()
                    .rename(columns={'app':'app_count_unique'})
                 ),
           how='inner',
           on=['employeeId', 'start_date'])
    .merge(right=(
                    employee_date_groupby['duration']
                    .min()
                    .reset_index()
                    .round(3)
                    .rename(columns={'duration':'min_app_duration (s)'})
                 ),
           how='inner',
           on=['employeeId', 'start_date'])
    .merge(right=(
                    employee_date_groupby['duration']
                    .max()
                    .reset_index()
                    .round(3)
                    .rename(columns={'duration':'max_app_duration (s)'})
                 ),
           how='inner',
           on=['employeeId', 'start_date'])
    .merge(right=(
                    employee_date_groupby['duration']
                    .mean()
                    .reset_index()
                    .round(3)
                    .rename(columns={'duration':'average_app_duration (s)'})
                 ),
           how='inner',
           on=['employeeId', 'start_date'])
    .merge(right=(
                    employee_date_groupby['duration']
                    .sum()
                    .round(3)
                    .reset_index()
                    .rename(columns={'duration':'total_duration (s)'})
                 ),
           how='inner',
           on=['employeeId', 'start_date'])
    .set_index(['employeeId', 'start_date'])
)

display(df_employee_date)

In [None]:
unique_employee = df_employee_date[df_employee_date['total_duration (s)'] < 10000.0].index.get_level_values(0).nunique()
unique_date = df_employee_date[df_employee_date['total_duration (s)'] < 10000.0].index.get_level_values(1).nunique()

unique_employee_all = df_employee_date.index.get_level_values(0).nunique()
unique_date_all = df_employee_date.index.get_level_values(1).nunique()

print(f"Unique dates filtered: {unique_date}")
print(f"Unique dates all: {unique_date_all}\n\n")

print(f"Unique employee filtered: {unique_employee}")
print(f"Unique employee all: {unique_employee_all}")



In [None]:
df = pd.DataFrame(data)

# convert start and end from unix to datetime 
df['start'] = pd.to_datetime(df['start'], unit='ms')
df['end'] = pd.to_datetime(df['end'], unit='ms')

df['transaction_lasted'] = (df['end'] - df['start']).dt.seconds

# reorder columns
columns_order = [
    'id',
    'teamId',
    'employeeId',
    'start',
    'end',
    'transaction_lasted',
    'app',
    'appId',
    'appFileName',
    'site',
    'categoryId',
    'redacted_url',
    'redacted_title',
    'keystrokes',
    'mouseScroll',
    'mouseClicks',
    'mic',
    'camera',
    'active',
    'productivity',
    'failure',
    'os'
]

# rename columns & sort values at the employee level
df = (
    df[columns_order]
    .rename(columns={'start':'start_datetime', 'end': 'end_datetime'})
    .sort_values(by=['employeeId', 'start_datetime'], ascending=[True, True])
)

display(df.tail())
print(f'Shape of the data: {df.shape}\n')
print(f'Unique values of Id column: {df.id.nunique()}')

### OpenAI API mappings

In [None]:
import openai
import time
import pandas as pd

# Set your API key here
openai.api_key = "your-api-key-here"

# Function to categorize a list of sites in batches of 100
def categorize_sites(df, site_column, categories, model="gpt-4-mini", batch_size=100):
    df['site_mapped'] = None  # Initialize the new column for mapped categories
    
    # Define the prompt template with placeholders
    prompt_template = (
        "Categorize each of the following sites into one of these categories: "
        f"{', '.join(categories)}. If a site doesn’t clearly fit, use your best judgment.\n\n"
    )

    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i + batch_size]
        site_list = batch[site_column].tolist()
        
        # Prepare the batch prompt with site indices
        sites_prompt = "\n".join([f"{j+1}. Site: {site}" for j, site in enumerate(site_list)])
        prompt = prompt_template + sites_prompt + "\n\nReturn the categories as a list."
        
        try:
            response = openai.Completion.create(
                model=model,
                prompt=prompt,
                max_tokens=500,
                temperature=0.3,
                n=1,
                stop=None
            )
            # Parse response, keeping list order aligned with input sites
            categories = response.choices[0].text.strip().splitlines()
            categories = [cat.strip() for cat in categories if cat]  # Clean and keep non-empty lines
            
            # Assign each mapped category back to the respective site in the DataFrame
            for idx, category in zip(batch.index, categories):
                df.at[idx, 'site_mapped'] = category

        except Exception as e:
            print(f"Error processing batch starting at index {i}: {e}")
            # Assign "Error" in case of failure for each site in the batch
            for idx in batch.index:
                df.at[idx, 'site_mapped'] = "Error"
        
        # Optional delay to avoid hitting rate limits
        time.sleep(0.1)
    
    return df

# Example usage
df = pd.DataFrame({'site': ["example1.com", "example2.com", "financeportal.com", ...]})
categories = ["Finance", "Communication", "Social Media", "News", "Entertainment", ...]

df = categorize_sites(df, 'site', categories)
print(df[['site', 'site_mapped']])


In [None]:
import openai
import time
import pandas as pd

openai.api_key = "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"

def categorize_sites(df, site_column, categories, model="gpt-4-mini", batch_size=100, retries=3):
    df['site_mapped'] = None

    prompt_template = (
        "Categorize each of the following sites into one of these categories: "
        f"{', '.join(categories)}. If a site doesn’t clearly fit, use your best judgment.\n\n"
    )

    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i + batch_size]
        site_list = batch[site_column].tolist()
        
        sites_prompt = "\n".join([f"{j+1}. Site: {site}" for j, site in enumerate(site_list)])
        prompt = prompt_template + sites_prompt + "\n\nReturn the categories as a list."

        for attempt in range(retries):
            try:
                response = openai.Completion.create(
                    model=model,
                    prompt=prompt,
                    max_tokens=600,
                    temperature=0.2,
                    n=1,
                    stop=None
                )
                
                categories_response = response.choices[0].text.strip().splitlines()
                categories_response = [cat.strip() for cat in categories_response if cat]
                
                if len(categories_response) == batch_size:
                    for idx, category in zip(batch.index, categories_response):
                        df.at[idx, 'site_mapped'] = category
                    break 

                else:
                    print(f"Incomplete response for batch starting at index {i}; retrying missing entries...")

            except Exception as e:
                print(f"Error processing batch starting at index {i}, attempt {attempt + 1}: {e}")
                time.sleep(1)

        missing_indices = batch[df['site_mapped'].isnull()].index
        
        for idx in missing_indices:
            site = df.at[idx, site_column]
            single_prompt = prompt_template + f"1. Site: {site}\n\nReturn the category."

            for attempt in range(retries):
                try:
                    response = openai.Completion.create(
                        model=model,
                        prompt=single_prompt,
                        max_tokens=10,
                        temperature=0.3,
                        n=1,
                        stop=None
                    )
                    category = response.choices[0].text.strip()
                    df.at[idx, 'site_mapped'] = category
                    break  
                except Exception as e:
                    print(f"Error retrying site {site} at index {idx}, attempt {attempt + 1}: {e}")
                    time.sleep(1)
                    if attempt == retries - 1:
                        df.at[idx, 'site_mapped'] = "Error" 

    return df


df = categorize_sites(df, 'site', categories)


In [None]:
import openai
import csv

# Set your API key
openai.api_key = "YOUR_API_KEY"

# Define the list of site strings you want to categorize
sites = [
    "google.mail.com", "mail.yahoo.com", "linkedin.com",
    # Add your 250,000 site strings here
]

# Define your categories for classification
categories = ["Finance", "Communication", "Social Media", "E-commerce", "Education", "News"]
prompt_template = """
Classify the following site into one of the categories: {categories}.
Site: {site}
Category:
"""

# Helper function to format the prompt
def format_prompt(site):
    return prompt_template.format(categories=", ".join(categories), site=site)

# Divide sites into batches for the Batch API
batch_size = 100 
batched_sites = [sites[i:i + batch_size] for i in range(0, len(sites), batch_size)]

# Process each batch
results = []
for batch in batched_sites:
    # Format the prompts for the entire batch
    batch_prompts = [format_prompt(site) for site in batch]

    try:
        # Send batch request to OpenAI API
        response = openai.Completion.create(
            model="gpt-4o-mini",
            prompt=batch_prompts,
            max_tokens=15,
            temperature=0.0
        )
        
        # Collect responses, handling cases with missing or empty predictions
        for i, choice in enumerate(response.get("choices", [])):
            if 'text' in choice and choice['text'].strip():
                # Valid prediction found
                results.append((batch[i], choice['text'].strip()))
            else:
                # Handle missing prediction (assign 'Unknown' or other placeholder)
                results.append((batch[i], "Unknown"))

    except Exception as e:
        # Handle API call errors by logging and assigning 'Error' as the prediction
        print(f"Error processing batch: {e}")
        for site in batch:
            results.append((site, "Error"))

# Print the results or save to a file
for site, category in results:
    print(f"Site: {site} - Category: {category}")

# Optionally save to a CSV
with open('categorized_sites.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Site", "Category"])
    writer.writerows(results)


### Batch API

In [None]:
import json
from pathlib import Path
import openai
from openai import OpenAI
import pandas as pd
from IPython.display import Image, display

dataset_path1 = Path("./mappings/site_mappings_2nd_round.csv")
dataset_path2 = Path("./mappings/unique_apps.csv")

df_mappings = pd.read_csv(dataset_path1)
df_apps = pd.read_csv(dataset_path2)


In [None]:
all_mappings = df_mappings.site_mapping_v3.unique()

In [None]:
all_mappings

In [None]:
client = openai.OpenAI(api_key="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")

In [None]:
# Convert the list into a more readable format
categories_list = "\n".join(", ".join(all_mappings[i:i+4]) for i in range(0, len(all_mappings), 4))


categorize_system_prompt = f"""
Your goal is to categorize apps based on their general purpose and usage. You will be provided with an app name, and you will output a JSON object containing the following information:

{{
    "Category": "string" // The primary category of the app  based on its general purpose and usage
}}

The Category must be chosen exclusively from the following predefined options:
{categories_list}

Only select a single category from this list that most accurately reflects the app’s main purpose and typical usage. Do not create any new categories outside of those provided.
"""



def get_categories(website_name):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.1,
        response_format={ 
            "type": "json_object"
        },
        messages=[
            {
                "role": "system",
                "content": categorize_system_prompt
            },
            {
                "role": "user",
                "content": website_name
            }
        ],
    )

    return response.choices[0].message.content


In [None]:
categories_list

In [None]:
# Creating an array of json tasks

df2_poc = df2[:1000].copy()

tasks = []

for index, row in df2_poc.iterrows():
    
    website_name = row['site']
    
    task = {
        "custom_id": f"task-{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            # This is what you would have in your Chat Completions API call
            "model": "gpt-4o-mini",
            "temperature": 0.1,
            "response_format": { 
                "type": "json_object"
            },
            "messages": [
                {
                    "role": "system",
                    "content": categorize_system_prompt
                },
                {
                    "role": "user",
                    "content": website_name
                }
            ],
        }
    }
    
    tasks.append(task)

In [None]:
# Creating the file

file_name = "batch_sites.jsonl"

with open(file_name, 'w') as file:
    for obj in tasks:
        file.write(json.dumps(obj) + '\n')

In [None]:
batch_file = client.files.create(
  file=open(file_name, "rb"),
  purpose="batch"
)

In [None]:
print(batch_file)

### Creating .jsonl files in a folder

In [None]:
from pathlib import Path
import pandas as pd

dataset_path1 = Path("./mappings/site_mappings_2nd_round.csv")
dataset_path2 = Path("./mappings/unique_apps.csv")

df_mappings = pd.read_csv(dataset_path1)
df_apps = pd.read_csv(dataset_path2)

all_mappings = df_mappings.site_mapping_v3.unique()

categories_list = "\n".join(", ".join(all_mappings[i:i+4]) for i in range(0, len(all_mappings), 4))


categorize_system_prompt = f"""
Your goal is to categorize a list of apps based on their general purpose and usage. You will be provided with a list of app names, and you will output a JSON object containing a single key, "Category", with its value as a list of categories. Each category in the list should correspond to an app in the input list, in the same order.

Each input app should be mapped to one of the predefined categories below:
{categories_list}

The JSON object should have the format:
{{
    "Category": [category_1, category_2, ...]
}}

The "Category" list should contain a single category for each app, corresponding to the input order, and only use categories from the provided list. Do not create any new categories outside of those given.
"""


In [None]:
categories_list

In [None]:
import os
import json
import pandas as pd

# Define paths and parameters
folder_path = './batch_files_apps'
batch_size = 1
batch_file_name = 'batch_sites'

# Check and create the folder if it does not exist
try:
    os.makedirs(folder_path, exist_ok=True)
    print(f"Folder '{folder_path}' {'created' if not os.path.exists(folder_path) else 'already exists'}.")
except OSError as e:
    print(f"Error creating folder '{folder_path}': {e}")
    exit()

# Load data
try:
    data = pd.read_csv('./unique_sites.csv')  # Adjust the file path as necessary
except FileNotFoundError as e:
    print(f"Data file not found: {e}")
    exit()
except pd.errors.EmptyDataError as e:
    print(f"Data file is empty: {e}")
    exit()
except Exception as e:
    print(f"Error loading data file: {e}")
    exit()

data = data.loc[:20, :]

# Compute number of files needed
num_files = (len(data) + batch_size - 1) // batch_size

# Process and write data in batches
for num_file in range(num_files):
    start_idx = num_file * batch_size
    end_idx = min(start_idx + batch_size, len(data))
    data_chunk = data.iloc[start_idx:end_idx]
    
    output_file = os.path.join(folder_path, f'{batch_file_name}_part{num_file}.jsonl')

    try:
        # Write each batch to the file
        with open(output_file, 'w') as file:
            payload = {
                "custom_id": f"task-{num_file}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "gpt-4o-mini",
                    "temperature": 0.1,
                    "response_format": { 
                        "type": "json_object"
                    },
                    "messages": [
                        {
                            "role": "system",
                            "content": categorize_system_prompt
                        },
                        {
                            "role": "user",
                            "content": [row['site'] for _, row in data_chunk.iterrows()] 
                        }
                    ],
                }
            }
                
            file.write(json.dumps(payload) + '\n')
    except IOError as e:
        print(f"Error writing to file '{output_file}': {e}")
        continue  # Skip to the next file in case of an error


### Processing jsonl files

In [None]:
import os
from openai import OpenAI

# Set OpenAI API key in the environment
OPENAI_API_KEY = "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"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
client = OpenAI()

# Directory containing batch input files
batch_directory = './batch_files'
batch_files = []

# Create file objects for the batch job
for filename in os.listdir(batch_directory):
    file_path = os.path.join(batch_directory, filename)
    batch_files.append(client.files.create(
        file=open(file_path, "rb"),
        purpose="batch"
    ))

# Get the IDs of the uploaded batch files
batch_file_ids = [batch_file.id for batch_file in batch_files]

# Create batch jobs for each file
batch_jobs = []
for index, file_id in enumerate(batch_file_ids):
    batch_jobs.append(client.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    ))

# Print the details of the created batch jobs
for batch_job in batch_jobs:
    print(batch_job)

In [None]:
import time

# Extract job IDs from the list of job creations
job_ids = [job.id for job in batch_jobs]

error_detected = False
completed_jobs = set()

while True:
    for job_id in job_ids:
        job_info = client.batches.retrieve(job_id)
        
        if job_info.status == "failed":
            # Stop loop if any job has failed
            print(f"Job {job_id} failed with error: {job_info.errors}")
            error_detected = True
            break
        
        elif job_info.status == "in_progress":
            print(f"Job {job_id} is in progress, {job_info.request_counts.completed}/{job_info.request_counts.total} requests completed")
        
        elif job_info.status == "finalizing":
            print(f"Job {job_id} is finalizing, waiting for the output file ID")
        
        elif job_info.status == "completed":
            print(f"Job {job_id} has completed")
            completed_jobs.add(job_id)
        
        else:
            print(f"Job {job_id} is in status: {job_info.status}")
    
    # Exit loop if any job failed or all jobs are completed
    if error_detected or len(completed_jobs) == len(job_ids):
        break
    
    time.sleep(20)

In [None]:
# Extract output file IDs for each job
output_file_ids = [client.batches.retrieve(job_id).output_file_id for job_id in job_ids]

# Read the content of the output files
output_files_content = [client.files.content(file_id).text for file_id in output_file_ids]

# Extract custom ID and embedding from each file
category_data = []
for content in output_files_content:
    for line in content.split('\n')[:-1]:  # Avoid processing the last empty line
        parsed_data = json.loads(line)
        custom_id = parsed_data.get('custom_id')
        print(f"custom_id: {custom_id}")
        print(f"Response: {parsed_data['response']['body']}")
        break
        embedding = parsed_data['response']['body']['data'][0]['Category']
        embedding_data.append([custom_id, embedding])

# Convert the list of embeddings to a DataFrame
embedding_df = pd.DataFrame(embedding_data, columns=['custom_id', 'embedding'])

In [None]:
client.batches.retrieve('batch_672255351ad08190949714ce7380f4d4')

In [None]:
data = data.reset_index()
data = data.rename(columns={'index':'id'})
# we extract the id from the custom_id we created to merge with the original data file
embedding_results['id'] = embedding_results['custom_id'].apply(lambda x: int(x.split('custom_id_')[1]))
data_with_embedding = df.merge(embedding_results[['id','embedding']], on='id', how='left') 
data_with_embedding.to_csv('./data/data_with_embedding.csv', index=False)

In [None]:
import json
from pathlib import Path
import openai
from openai import OpenAI
import pandas as pd
from IPython.display import Image, display

client = openai.OpenAI(api_key="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")

dataset_path1 = Path("./mappings/site_mappings_2nd_round.csv")
# dataset_path2 = Path("./mappings/unique_apps.csv")

df_mappings = pd.read_csv(dataset_path1)
# df_apps = pd.read_csv(dataset_path2)

# df_apps = df_apps.loc[:, 'app']

all_mappings = df_mappings.site_mapping_v3.unique()

categories_list = ", ".join(all_mappings)

In [None]:
df_apps.head(30)

In [None]:
categorize_system_prompt = f'''
Your goal is to categorize a list of websites based on their general purpose and usage.
You will be provided with a single string containing websites names separated by commas.
You will output a JSON object containing a single key, "Category", with its value as a list of categories.
Each category in the list should correspond to an website in the input string, in the same order.

It is crucial that every website in the input string is assigned a category in the output list. Do not skip any websites, and ensure the output list has a one-to-one mapping for each website provided.

Each input website should be mapped to one of the predefined categories below:
{categories_list}

The JSON object should have the format:

{{
    "Category": [category_1, category_2, ...] // Array of categories corresponding to each website
}}

The "Category" list should contain a single category for each website in the input string, based on the order provided, and only use categories from the given list.
Do not create any new categories outside of those specified.
'''

df = df_missed_sites.copy()

tasks = []

# Iterate over the DataFrame in steps of 10
for i in range(0, len(df), 2):

    apps = ', '.join(df.iloc[i:i+2, 0])
    apps = apps.strip()
 
    # Create the task with the concatenated descriptions
    task = {
        "custom_id": f"task-{i // 2}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "temperature": 0.1,
            "response_format": { 
                "type": "json_object"
            },
            "messages": [
                {
                    "role": "system",
                    "content": categorize_system_prompt
                },
                {
                    "role": "user",
                    "content": apps
                }
            ],
        }
    }
    
    tasks.append(task)


In [None]:
import json

# Number of files to create
num_files = 1

# Splitting `tasks` list into chunks for each file
chunk_size = len(tasks) // num_files
chunks = [tasks[i * chunk_size: (i + 1) * chunk_size] for i in range(num_files - 1)]
chunks.append(tasks[(num_files - 1) * chunk_size:])  # Add remaining tasks to the last chunk

# Writing each chunk to a separate JSONL file
for i, chunk in enumerate(chunks, start=1):
    file_name = f"batch_files_2/batch_sites{i}.jsonl"
    with open(file_name, 'w') as file:
        for obj in chunk:
            file.write(json.dumps(obj) + '\n')


In [None]:
# Creating the file

file_name = "batch_files_sites/missed_vatch_sites.jsonl"

with open(file_name, 'w') as file:
    for obj in tasks:
        file.write(json.dumps(obj) + '\n')

### Processing multiple jsonl files

In [None]:
import os
from openai import OpenAI


# Directory containing batch input files
batch_directory = './batch_files_2'
batch_files = []

# Create file objects for the batch job
for filename in os.listdir(batch_directory):
    file_path = os.path.join(batch_directory, filename)
    batch_files.append(client.files.create(
        file=open(file_path, "rb"),
        purpose="batch"
    ))

# Get the IDs of the uploaded batch files
batch_file_ids = [batch_file.id for batch_file in batch_files]

# Create batch jobs for each file
batch_jobs = []
for index, file_id in enumerate(batch_file_ids):
    batch_jobs.append(client.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    ))

# Print the details of the created batch jobs
for batch_job in batch_jobs:
    print(batch_job)

In [None]:
import time

# Extract job IDs from the list of job creations
job_ids = [job.id for job in batch_jobs]

error_detected = False
completed_jobs = set()

while True:
    for job_id in job_ids:
        job_info = client.batches.retrieve(job_id)
        
        if job_info.status == "failed":
            # Stop loop if any job has failed
            print(f"Job {job_id} failed with error: {job_info.errors}")
            error_detected = True
            break
        
        elif job_info.status == "in_progress":
            print(f"Job {job_id} is in progress, {job_info.request_counts.completed}/{job_info.request_counts.total} requests completed")
        
        elif job_info.status == "finalizing":
            print(f"Job {job_id} is finalizing, waiting for the output file ID")
        
        elif job_info.status == "completed":
            print(f"Job {job_id} has completed")
            completed_jobs.add(job_id)
        
        else:
            print(f"Job {job_id} is in status: {job_info.status}")
    
    # Exit loop if any job failed or all jobs are completed
    if error_detected or len(completed_jobs) == len(job_ids):
        break
    
    time.sleep(120)

In [None]:
# Extract output file IDs for each job
output_file_ids = [client.batches.retrieve(job_id).output_file_id for job_id in job_ids]

# Read the content of the output files
output_files_content = [client.files.content(file_id).text for file_id in output_file_ids]

In [None]:
folder_name = "./batch_files_2/sites_result_"

for i, file in enumerate(output_files_content):
    file_name = folder_name + f'{i+1}.jsonl'
    print(file)
    with open(file_name, 'w') as fp:
        fp.write(file)


### Processing 1 jsonl file

In [None]:
batch_file = client.files.create(
  file=open(file_name, "rb"),
  purpose="batch"
) 

batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

In [None]:
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

In [None]:
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content

In [None]:
result

In [None]:
result_file_name = "batch_files_sites/missed_batch_sites_result.jsonl"

with open(result_file_name, 'wb') as file:
    file.write(result)

In [None]:
import json

# Loading data from saved file
results = []
folder_name = "batch_files_sites/missed_batch_sites_result.jsonl"

for i in range(1):
    result_file_name = folder_name
    with open(result_file_name, 'r') as file:
        for line in file:
            # Parsing the JSON string into a dict and appending to the list of results
            json_object = json.loads(line.strip())
            results.append(json_object)

In [None]:
results[1]

In [None]:
import numpy as np

num_sites = 2


df_missed_sites["site_mapping"] = np.nan

# Populate the 'mapping' column
for i, idx in enumerate(good_index):
    start = idx * num_sites
    end = (idx + 1) * num_sites
    category = json.loads(results[idx]['response']['body']['choices'][0]['message']['content'])['Category']
    df_missed_sites.iloc[start:end, 1] = category


In [None]:
df_missed_sites.site_mapping.value_counts()

In [None]:
unique_mappingd_new = set(df_missed_sites.site_mapping.unique())
unique_mappingd_old = set(mappings_sites.site_mapping_v3.unique())

In [None]:
unique_mappingd_new - unique_mappingd_old

In [None]:
difference = list(unique_mappingd_new - unique_mappingd_old)
filtered_missed = df_missed_sites[df_missed_sites.site_mapping.isin(difference)]
filtered_missed

In [None]:
site_category_mapping = {
    'sttheresahospital.com.ng': 'Health & Wellness',
    'socialnowa.io': 'Social Networking',
    'compress-pdf.emapnet.com': 'Utilities & Tools',
    'demakufu.co.ke': 'Property & Real Estate',
    'colombiasmartfit.com.co': 'Health & Wellness',
    'holycodedoo.applytojob.com': 'Recruitment',
    'arcamax.com': 'News',
    'cdn.subsplash.com': 'Hosting & Domains',
    'hauzisha.co.ke': 'Property & Real Estate',
    'leapscholar.com': 'Education & Learning',
    'shouldiremoveit.com': 'Utilities & Tools',
    'localhost:42815': 'Local Development',
    'straightsecurity.com': 'Cybersecurity',
    'career.peek-cloppenburg.com': 'Recruitment',
    'trackstar.co.uk': 'Analytics',
    'engineeringblog.yelp.com': 'Blogging',
    'dashboard.payixy.net': 'Finance',
    'publicwin.ro': 'Gaming',
    'account.aliyun.com': 'Cloud Services',
    'dianesoldit.sites.cbmoxi.com': 'Property & Real Estate',
    'gospelreformation.net': 'Nonprofit & Community',
    'startupv.io': 'Business',
    'pb.olx.com.br': 'E-commerce',
    'myteleflora.com': 'E-commerce',
    'giddyupgreenclean.com': 'Service & Support',
    'eatntrack.ro': 'Health & Wellness',
    'allenwooooo.github.io': 'Development',
    'localhost:57013': 'Local Development',
    'freepressokc.com': 'News',
    'wesley.hu': 'Personal Blog',
    'csharpplayersguide.com': 'Education & Learning',
    'f-movies.cfd': 'Video Entertainment',
    'kompare.hr': 'Finance',
    'nrplearningplatform.com': 'Education & Learning',
    'irentlist.website': 'Property & Real Estate',
    'enjoy-web-establishments-oyezvwfyo-fatcatcoders.vercel.app': 'Website Management',
    'threadreaderapp.com': 'Blogging',
    'metropestcontroldfw.com': 'Service & Support',
    'ecsbluediamondfm-my.sharepoint.com': 'Documents & Collaboration',
    '7f1a-35-231-149-40.ngrok-free.app': 'Local Development',
    'suricata.io': 'Cybersecurity',
    'dallasfortworthtermitepestcontrol.com': 'Service & Support',
    'srv.concrete.co.ke:8090': 'Construction',
    'tradefeeds.com': 'Finance',
    'info.shopify-desk.com': 'E-commerce',
    'hostkey.com': 'Hosting & Domains',
    'hbogoasia.ph': 'Video Entertainment',
    'californ.zip': 'Miscellaneous',
    'depot.dev': 'Development',
    'thriveinfocus.com': 'Health & Wellness',
    'theblankapp.com': 'Utilities & Tools',
    'promptitude.io': 'AI Assistance',
    'hotelmemories.hu': 'Hospitality',
    'madhaafaanrealestate.com': 'Property & Real Estate',
    'rcsprouljr.com': 'Blogging',
    'techwelkin.com': 'Education & Learning',
    'vpn-panel.top-chrome': 'Cybersecurity',
    'ackstmarkswestlands.or.ke': 'Nonprofit & Community',
    'tbctopc.com': 'Education & Learning',
    'online.vitalsource.com': 'Education & Learning',
    'links.rocketlawyer.com': 'Legal & Compliance',
    'dcaa902f7ee868f656.gradio.live': 'Development',
    'outlook.ca': 'Mail',
    'mi-home.pl': 'E-commerce',
    'datacater.io': 'Data & Analytics',
    'odpc.go.ke': 'Public Sector & Politics',
    'onehourairftworth.com': 'Service & Support'
}

# Update `site_mapping` in `df_missed_sites` only if the `site` exists in `site_category_mapping`
df_missed_sites['site_mapping'] = df_missed_sites['site'].apply(
    lambda x: site_category_mapping[x] if x in site_category_mapping else None
).combine_first(df_missed_sites['site_mapping'])

# Display the updated DataFrame
df_missed_sites.head()

In [None]:
len(mappings_sites.site_mapping_v3.unique())

In [None]:
mappings_sites

In [None]:
import pandas as pd

# Rename `site_mapping_v3` to `site_mapping` in `mapping_sites`
mapping_sites = mappings_sites.rename(columns={'site_mapping_v3': 'site_mapping'})

# Concatenate `df_missed_sites` and `mapping_sites`
combined_df = pd.concat([df_missed_sites, mapping_sites], ignore_index=True)

# Display the combined DataFrame
print(combined_df)


In [None]:
combined_df.to_csv('mappings/site_mappings_2nd_round_v3.csv')

In [None]:
df_apps.app_mapping.unique()


In [None]:
res1 = app_categories = {
    'Microsoft Clock': 'Time Tracking',
    'GoToMeeting': 'Video Conferencing',
    'Search and Cortana application': 'Search Engine',
    'ScreenRec': 'Utilities & Tools',
    'ProcenatRadnika': 'Human Resources',
    'Microsoft 365 and Office': 'Documents & Collaboration',
    'Elegir una aplicación': 'Localization & Translation',
    'Microsoft Phone Link': 'Remote Access',
    'copyq': 'Utilities & Tools',
    'Вибрати програми': 'Localization & Translation',
    'Adobe After Effects CS6': 'Creative & Design',
    'MouseJiggler': 'Utilities & Tools',
    'SEO SpyGlass': 'SEO & Analytics',
    'Wacom Technology, Corp. Control Panel': 'Creative & Design',
    'clumsy.exe': 'Utilities & Tools',
    'CMFDeckDesign': 'Creative & Design',
    'Command | Update Windows Universal Application, 5.4.0, A00': 'Operating System',
    'DNSFilter Agent TrayIcon': 'Cybersecurity',
    'Izaberite aplikaciju': 'Localization & Translation',
    'Java Platform SE 8 U361': 'Development',
    'LocalSend': 'File Sharing',
    'Real HEIC to JPG Converter': 'Utilities & Tools',
    'Robot Structural Analysis Professional Preview': 'Engineering',
    'SAND/SCALE *** Structural ANalysis and Design *** Structural CALculations Ensemble ***': 'Engineering',
    'Command | Update Application, 5.4.0, A00': 'Operating System',
    'ControlPanel': 'Operating System',
    'Dependencies': 'Development',
    'ProSeriesLauncher': 'Business Software',
    'Proton VPN': 'Cybersecurity',
    'Realtek High-Definition Audio Driver, 6.0.1.6111, A06': 'Audio & Video',
    'USBPcap': 'Utilities & Tools',
    'United Parcel Service, Inc. PatchUPS': 'Logistics & Transportation',
    'Windows-Problemberichterstattung': 'Operating System',
    'Bir uygulama seçin': 'Localization & Translation',
    'Builder': 'Development',
    'Cambiar la configuración del equipo': 'Operating System'
}
res2 = app_categories = {
    'Harver System Checker': 'IT Services',
    'Intel Rapid Storage Technology Driver and Application, 19.5.1.1040, A02': 'Operating System',
    'Intel(R) Network Configuration Services': 'Networking',
    'Fill App(Windows)': 'Utilities & Tools',
    'Firmware Downloader and ZBI Key Manager Application': 'Operating System',
    'GitLab.UI': 'Development',
    'VoiceTyper': 'AI Assistance',
    'VoicemeeterProSetup': 'Audio & Video',
    'Wacom Technology, Corp. Deployer/Undeployer': 'Creative & Design',
    'Install ClickUp': 'Project Management',
    'Intel BE200/AX411/AX211/AX210/AX201/AX200/9560/9462/9260 Wi-Fi UWD Driver, 23.60.1.2, A45': 'Networking',
    'Intel Graphics Properties': 'Operating System',
    'Intel HID Event Filter Driver, 2.2.2.9, A21': 'Operating System',
    'Intel Management Engine Interface (MEI) Driver': 'Operating System',
    'Jet Reports Services': 'Business Software',
    'TimerMS': 'Time Tracking',
    'Toshiba MQ04ABF100 Hard Drive Firmware Update, 10.0003, A00': 'Operating System',
    'Tradução do jogo: Chrono Trigger': 'Localization & Translation',
    'Intel 3165/7265/8260/8265 Wi-Fi UWD Driver, 22.130.0.5, A17': 'Networking',
    'Intel(R) Arc(TM) Control': 'Operating System',
    'Keyboard Layout': 'Operating System',
    'Command Deploy Driver Pack for Latitude 5410, 1.0, A01': 'Operating System',
    'CoreChip driver install': 'Operating System',
    'CortexTools3™': 'Development',
    'Dell Optimizer Application, 4.1.353.0, A00': 'IT Services',
    'DiskGenius': 'Data Recovery',
    'DisplayLink Core Software v10.1.2875.0': 'Utilities & Tools',
    'Install Notion Calendar': 'Calendar & Scheduling',
    'Insyde H2OFFT': 'Operating System',
    'Intel BE202/BE200/AX411/AX211/AX210/AX201/AX200/9560/9462/9260 Wi-Fi UWD Driver, 23.60.1.2, A57': 'Networking',
    'M-TAG': 'IT Services',
    'MAMP & MAMP PRO 4.2.0': 'Local Development',
    'MDSuDS': 'Engineering',
    'VisualSFM': 'Creative & Design',
    'WPS Office安装程序': 'Documents & Collaboration',
    'Waves MaxxAudio Pro Application, 1.1.131.0, A06': 'Audio & Video',
    'Cinnamon-display-changes-dialog': 'Operating System',
    'Cirrus Logic High Definition Audio Driver, 10.0.6.18, A08': 'Audio & Video',
    'Cisco Secure Endpoint': 'Cybersecurity',
    'Dell Firmware Update': 'Operating System',
    'Dell On-Screen Display Application, 1.0.1.0, A00': 'Operating System',
    'Dell SupportAssist OS Recovery Plugin for Dell Update': 'Operating System',
    'FontDownloadConfirmationTest': 'Printing & Publishing',
    'Foxit PDF Editor Printer: Reliable, Affordable, Efficient': 'Document Management',
    'FurMark2 x64': 'Testing',
    'Intel High-Definition (HD) Graphics Driver': 'Operating System',
    'Intel Integrated Sensor Solution Driver, 3.10.100.4478, A02': 'Operating System',
    'Intel Management Engine Interface Driver, 2345.5.42.0, A04': 'Operating System',
    'Intel Serial IO Driver, 30.100.2020.7, A01': 'Operating System',
    'Intel UHD/Iris Xe Graphics Driver and Intel Graphics Command Center Application, 31.0.101.5333, A16': 'Operating System',
    'Intel UHD/Iris Xe Graphics Driver, 31.0.101.5333, A16': 'Operating System',
    'Intel UHD/Iris Xe/Iris Plus Graphics Driver, 31.0.101.5333, A10': 'Operating System',
    'Intel® Driver & Support Assistant': 'IT Services',
    'IoT Remote': 'Remote Access',
    'Isoimagewriter': 'Utilities & Tools',
    'Java Platform SE 8 U411': 'Development',
    'KDI OM3PGP4 PCIe NVMe Solid State Drive Firmware Update, 4130.0004, A00': 'Operating System',
    'OS Recovery Tool, 2.4.0.7813, A00': 'Data Recovery',
    'OTPKEY.Authenticator': 'Cybersecurity',
    'Office': 'Documents & Collaboration',
    'Optimizer Application, 4.2.3.0, A00': 'Utilities & Tools',
    'PC PRINT': 'Printing & Publishing',
    'PCmoverPopup': 'Utilities & Tools',
    'Power Manager Service, 3.16.0, A00': 'Utilities & Tools',
    'PowerShell 7.4.3.0-x64': 'Development',
    'PowerToys.MonacoPreviewHandler': 'Utilities & Tools',
    'Razer.Synapse3.Installer': 'Gaming',
    'Realtek Card Reader': 'Utilities & Tools',
    'Realtek High Definition Audio Driver, 6.0.8934.1, A17': 'Audio & Video',
    'Realtek High Definition Audio Driver, 6.0.9517.1, A83': 'Audio & Video',
    'Realtek IR Camera Driver, 10.0.15063.20012, A10': 'Utilities & Tools',
    'Realtek RTL8821CE/RTL8822CE Wi-Fi and Bluetooth Driver, 2024.10.139.3, A13': 'Networking',
    'Realtek RTL8821CE/RTL8822CE Wi-Fi and Bluetooth Driver, 2024.10.139.3, A15': 'Networking',
    'Realtek USB Audio DCH Driver, 6.3.9600.2370, A22': 'Audio & Video',
    'Reason Safer Web': 'Cybersecurity',
    'Samsung PM9A1 Solid State Drive Firmware Update, 3631.0229, A00': 'Operating System',
    'Samsung Printer Experience': 'Printing & Publishing',
    'ScanSnap Folder': 'Document Management',
    'Thunking WIA APIS from 32 to 64 Process': 'Utilities & Tools',
    'TicketingTray': 'Customer Support',
    'Touchpad Firmware Update Utility, 1160.4171.51, A02': 'Operating System',
    'ReSharper Visual Studio Marketplace Installer': 'Development',
    'Realtek High Definition Audio Driver, 6.0.9147.1, A19': 'Audio & Video',
    'Realtek High Definition Audio Driver, 6.0.9175.1, A27': 'Audio & Video',
    'TeamSpeak 3 Client Error Reporter': 'Communication',
    'TechPowerUp GPU-Z': 'Analytics',
    'Tekla Portal Frame Designer and Tekla Connection Designer 24': 'Engineering'
}

# Concatenate dictionaries
combined_dict = {**res1, **res2}

# Create DataFrame
df = pd.DataFrame(list(combined_dict.items()), columns=['app', 'app_mapings'])

len(df.app_mapings.unique())

In [None]:
df_apps.columns

In [None]:
df.columns

In [None]:
# Merge df_apps with df
df_apps = df_apps.merge(df[['app', 'app_mapings']], on='app', how='left', suffixes=('', '_new'))

# Check if 'app_mapings_new' was created during the merge
if 'app_mapings_new' in df_apps.columns:
    # Use assignment instead of inplace=True to avoid warnings
    df_apps['app_mapping'] = df_apps['app_mapping'].fillna(df_apps['app_mapings_new'])
    
    # Drop the temporary 'app_mapings_new' column
    df_apps.drop(columns=['app_mapings_new'], inplace=True)
else:
    print("Column 'app_mapings_new' was not created. Please check if 'app' values in both dataframes match correctly.")


In [None]:
# Step 1: Merge `df_apps` and `df` to check mappings, with troubleshooting output
merged = df_apps.merge(df[['app', 'app_mapings']], on='app', how='left', suffixes=('_apps', '_df'))

# Display columns in merged DataFrame to verify suffixes
print("Merged DataFrame columns:", merged.columns)

# Confirm if 'app_mapping_apps' and 'app_mapings_df' exist
if 'app_mapping' in merged.columns and 'app_mapings' in merged.columns:
    # Identify inconsistent mappings
    inconsistent_apps = merged[merged['app_mapping'] != merged['app_mapings']]['app'].unique()
    
    if len(inconsistent_apps) > 0:
        print("Inconsistent app mappings found for apps:", inconsistent_apps)
    else:
        print("All mappings are consistent between `df_apps` and `df`.")
else:
    print("Merged DataFrame did not create the expected columns. Please check column names.")

# Step 2: Remove rows in `df_apps` where `app` has NaN values
df_apps = df_apps.dropna(subset=['app'])

# Step 3: Fill NaN values in `app_mapping` with "Other"
df_apps['app_mapping'].fillna("Other", inplace=True)

# Display the updated df_apps
df_apps.head()


In [None]:
len(df_apps.app_mapping.unique())

In [None]:
df_apps

In [None]:
df_apps

In [None]:
path = "mappings/app_mappings_1st_rounds.csv"

with open(path, 'w') as fp:
    df_apps.to_csv(fp)

In [None]:
# Merge df_apps with df, specifying the different column names for app_mapping
df_apps = df_apps.merge(df[['app', 'app_mapings']], on='app', how='left', suffixes=('', '_new'))

# Update app_mapping column in df_apps where it is None using values from app_mapings in df
df_apps['app_mapping'].fillna(df_apps['app_mapings_new'], inplace=True)

# Drop the extra column after filling values
df_apps.drop(columns=['app_mapings_new'], inplace=True)


In [None]:
# Merge df2 with df2_second on 'site' to get matching 'site_mapping' values
df2 = df2.merge(df2_second[['site', 'site_mapping']], on='site', how='left', suffixes=('', '_second'))

# Update site_mapping in df2 only where it exists in df2_second
df2['site_mapping'] = df2['site_mapping'].combine_first(df2['site_mapping_second'])

# Drop the extra column from the merge
df2.drop(columns=['site_mapping_second'], inplace=True)

# Display the result to verify
print(df2.head())


### Comparasion

In [None]:
# Sample 50 unique sites from df2_second
sample_sites = df2_second['site'].drop_duplicates().sample(500, random_state=1)

# Filter both df2 and df2_second with the sampled sites
df2_sample = df2[df2['site'].isin(sample_sites)]
df2_second_sample = df2_second[df2_second['site'].isin(sample_sites)]

# Merge the two samples on 'site' to compare 'site_mapping' values side by side
comparison_df = df2_sample[['site', 'site_mapping']].merge(
    df2_second_sample[['site', 'site_mapping']],
    on='site',
    suffixes=('_df2', '_df2_second')
)

# Display the comparison
print(comparison_df)


In [None]:
df2[df2.site_mapping.isna()]

In [None]:
df2['site_mapping'] = df2['site_mapping'].fillna("Other")

In [None]:
file_path = './mappings/sites_mappings_1st_round.csv'

with open(file_path, 'w') as fp:
    df2.to_csv(fp, sep=';')

In [None]:
df2_second = df2[df2.site_mapping.isna()]

In [None]:
df2_second

In [None]:
counter = 0
good_index = []
for i in range(len(results)):
    category = json.loads(results[i]['response']['body']['choices'][0]['message']['content'])['Category']
    if len(category) == 2:
        good_index.append(i)
        counter += 1
print(f"Total count = {counter}")

In [None]:
# Reading only the first results
for res in results[:5]:
    # Getting index from task id
    index = task_id.split('-')[-1]
    result = res['response']['body']['choices'][0]['message']['content']
    movie = df.iloc[int(index)]
    description = movie['Overview']
    title = movie['Series_Title']
    print(f"TITLE: {title}\nOVERVIEW: {description}\n\nRESULT: {result}")
    print("\n\n----------------------------\n\n")

In [None]:
category = json.loads(results[555]['response']['body']['choices'][0]['message']['content'])

In [None]:
all_results, good_index = [], []
for index in range(len(results)):
    try:
        category = json.loads(results[index]['response']['body']['choices'][0]['message']['content'])
        all_results.append(category)
        good_index.append(index)
    except Exception as e:
        print(f"Bad Example: {task['response']['body']['choices'][0]['message']['content']}")

In [None]:
index

In [None]:
import pandas as pd
import numpy as np

num_sites = 50

df2["site_mapping"] = np.nan

# Populate the 'mapping' column
for i, idx in enumerate(good_index):
    start = idx * num_sites
    end = (idx + 1) * num_sites
    print(len(all_results[i]["Category"]))
    df2.loc[start:end-1, "site_mapping"] = all_results[i]["Category"]

# Display the result
print(df2.head(300))  # Adjust range as needed to view more rows


### apps analytics

In [None]:
from pathlib import Path
import pandas as pd

path_file = Path('./mappings/unique_apps.csv')

with open(path_file, 'r') as fp:
    df_apps = pd.read_csv(fp)

df_apps.head(30)

In [None]:
categories_list = ", ".join(df1["Category"])

categorize_system_prompt = f'''
Your goal is to categorize a list of apps based on their general purpose and usage.
You will be provided with a single string containing app names separated by commas.
You will output a JSON object containing a single key, "Category", with its value as a list of categories.
Each category in the list should correspond to a website in the input string, in the same order.

It is crucial that every website in the input string is assigned a category in the output list. Do not skip any websites, and ensure the output list has a one-to-one mapping for each website provided.

Each input website should be mapped to one of the predefined categories below:
{categories_list}

The JSON object should have the format:

{{
    "Category": [category_1, category_2, ...] // Array of categories corresponding to each website
}}

The "Category" list should contain a single category for each website in the input string, based on the order provided, and only use categories from the given list.
Do not create any new categories outside of those specified.
'''

In [None]:
from pathlib import Path
import pandas as pd

path_file = Path('./data/150_users_dataset.csv')

with open(path_file, 'r') as fp:
    df_apps = pd.read_csv(fp, sep=';')

df_apps.head(30)

### All data preprocessing/mappings

In [None]:
from pathlib import Path
import pandas as pd

path_file = Path('./data/150_users_dataset.csv')

with open(path_file, 'r') as fp:
    df_apps = pd.read_csv(fp, sep=';')

df_apps.head(30)

### Double Linked Lists

In [None]:
class DoubleLinked:
    def __init__(self, val, next = None, prev = None):
        self.val = val
        self.next = next
        self.prev = prev

    def __str__(self):
        return str(self.val)

head = tail =DoubleLinked(1)

def add_on_the_beggining(val, head, tail):
    new_node = DoubleLinked(val, next=head)
    head.prev = new_node
    return new_node, tail

def add_on_the_end(val, head, tail):
    new_node = DoubleLinked(val, next=None, prev=tail)
    tail.next = new_node
    return head, new_node
    

### Binary Search

In [None]:
# 1st scenario

A = [-7, -4, 0, 1, 5, 9]

def binary_search(arr, num):
    N = len(arr)
    L = 0
    R = N - 1

    while L <= R:

        M = L + ((R - L) // 2)
        if arr[M] == num:
            return True
        elif arr[M] > num:
            R = M - 1
        else:
            L = M + 1
    return False

B = [True, True, True, True, False, False]

def binary_search_2(arr, num):
    N = len(arr)
    L = 0
    R = N - 1

    while L < R:
        M = L + ((R - L) // 2)

        if not B[M]:
            R = M
        else:
            L = M + 1
    return L

### Binary Trees & Binary Search Trees

In [None]:
# Postorder traversal - recursion

def post_order(node):
    if not node:
        return

    post_order(node.left)
    post_order(node.right)
    print(node.val)

# Postorder - Stack implementatio
def post_order_st(node):
    stack = [node]
    while stack:
        node = stack.pop()

        print(node.val)
        if node.right: stack.append(node.right)
        if node.left: stack.append(node.left)

In [None]:
from collections import deque
# BFS
def bfs(node):
    queue = deque()
    queue.append(node)
    
    while queue:
        node = queue.popleft()
        queue.append(node.left)
        queue.append(node.right)
        print(node.val)
       
# Check if a value exist ina  tree (DFS)
def search(node, target):
    if not node:
        return
    if node.val == target:
        return True
    return search(node.left) or search(node.right)