# Text - Load Documents into Database

This notebook implements ETL - Extract, Transform, Load - to get our documents into a sqlite database

# Imports

In [12]:
import pandas as pd
from github import Github
import requests
from datetime import datetime, timezone
import json
import os
import re
import json
import csv
from github import GithubException
from requests.exceptions import RetryError
from urllib3.exceptions import MaxRetryError
from retrying import retry
import sqlite3
import pandas as pd

# Extract - Load the data

In [13]:
# Load the CSV file into a pandas DataFrame
file_path = 'data/combined.csv'
df = pd.read_csv(file_path)

In [14]:
# Display the first few rows of the DataFrame to verify the data
df.head(3).T

Unnamed: 0,0,1,2
Talk ID,1,2,3
Title,Applying AI with Python,Harnessing the Power of Community: Lessons fro...,It’s About Time: Time-Series Forecasting with ...
Abstract,Artificial Intelligence (AI) is transforming i...,"Speedrunning, the art of completing video game...","Along with the rise of “AI”, data-driven decis..."
Speaker Name,Alex Conway,Nunudzai Mrewa,Brenden Taylor
Speaker Profile URL,https://2024.za.pycon.org/users/alxcnwy/,https://2024.za.pycon.org/users/JustNunuz/,https://2024.za.pycon.org/users/brenden.taylor22/
Talk URL,https://2024.za.pycon.org/talks/11-applying-ai...,https://2024.za.pycon.org/talks/13-harnessing-...,https://2024.za.pycon.org/talks/19-its-about-t...
Speaker ID,1,2,3
Speaker ID.1,1,2,3
Name,Alex Conway,Nunudzai Mrewa,Brenden Taylor
Profile URL,https://2024.za.pycon.org/users/alxcnwy/,https://2024.za.pycon.org/users/JustNunuz/,https://2024.za.pycon.org/users/brenden.taylor22/


# Join topics

In [15]:
# Load the topics file into pandas DataFrames
file_path_topics = 'data/topics.csv'
df_topics = pd.read_csv(file_path_topics)

# Merge the two DataFrames on 'Talk ID', and rename the 'Assigned Topic' column to 'Topic'
df = pd.merge(df, df_topics, on='Talk ID', how='left').rename(columns={'Assigned Topic': 'Topic'})

In [16]:
# Display the first few rows of the merged DataFrame to verify the data
df.head(3).T

Unnamed: 0,0,1,2
Talk ID,1,2,3
Title,Applying AI with Python,Harnessing the Power of Community: Lessons fro...,It’s About Time: Time-Series Forecasting with ...
Abstract,Artificial Intelligence (AI) is transforming i...,"Speedrunning, the art of completing video game...","Along with the rise of “AI”, data-driven decis..."
Speaker Name,Alex Conway,Nunudzai Mrewa,Brenden Taylor
Speaker Profile URL,https://2024.za.pycon.org/users/alxcnwy/,https://2024.za.pycon.org/users/JustNunuz/,https://2024.za.pycon.org/users/brenden.taylor22/
Talk URL,https://2024.za.pycon.org/talks/11-applying-ai...,https://2024.za.pycon.org/talks/13-harnessing-...,https://2024.za.pycon.org/talks/19-its-about-t...
Speaker ID,1,2,3
Speaker ID.1,1,2,3
Name,Alex Conway,Nunudzai Mrewa,Brenden Taylor
Profile URL,https://2024.za.pycon.org/users/alxcnwy/,https://2024.za.pycon.org/users/JustNunuz/,https://2024.za.pycon.org/users/brenden.taylor22/


Now we have topics too

# Transform - Enrich our data by scraping github and twitter

In [9]:
token = "github_pat_11ACB..."

```
{
    "username": "alxcnwy",
    "name": "Alex Conway",
    "public_repos": 18,
    "total_stars": 104,
    "most_starred_repo": {
        "name": "Deep-Neural-Networks-for-Video-Classification",
        "stars": 44
    },
    "last_commit_datetime": "2024-10-02T11:26:04+00:00",
    "hours_since_last_commit": 0.5807980566666667,
    "followers": 44,
    "following": 15,
    "created_at": "2014-09-02T18:24:14+00:00",
    "company": null,
    "bio": "making something people want | \u27e0 \u20bf | \r\nycombinator\r\n W22 | won awards for AI projects from companies like Mercedes-Benz, NTT Japan, etc",
    "location": "Cape Town",
    "blog": "www.numberboost.com",
    "email": null
}

```

## Scrape all github data where available

In [26]:
# Directory path where .txt files are located
data_directory = 'data/combined/'
output_csv_path = 'data/github.csv'

# Function to extract data from each .txt file
def extract_data_from_txt(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
    
    # Extract Speaker ID
    speaker_id_match = re.search(r"Speaker ID: (\d+)", content)
    speaker_id = speaker_id_match.group(1) if speaker_id_match else None
    
    # Extract GitHub URL (first URL that matches GitHub pattern)
    github_url_match = re.search(r"(https://github\.com/[a-zA-Z0-9_-]+)", content)
    github_url = github_url_match.group(1) if github_url_match else None
    
    # Return the extracted data
    return {
        "speaker_id": speaker_id,
        "github_url": github_url
    }

# Retry function with exponential backoff for handling API request errors
@retry(stop_max_attempt_number=1, wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_github_profile_data(profile_url):
    print("Scraping github for:", profile_url)
    try:
        # Extract username from profile URL
        username = profile_url.rstrip('/').split('/')[-1]
        
        # Get user information
        user = g.get_user(username)
        
        # Fetch all repos of the user
        repos = user.get_repos()
        
        # Data collection variables
        last_commit = None
        hours_since_last_commit = None
        total_stars = 0
        most_starred_repo = None
        most_stars = 0
        num_repos = repos.totalCount
        
        # Process repositories
        for repo in repos:
            total_stars += repo.stargazers_count
            if repo.stargazers_count > most_stars:
                most_stars = repo.stargazers_count
                most_starred_repo = repo
            
            # Find the latest commit
            try:
                commits = repo.get_commits()
                if commits.totalCount > 0:
                    latest_commit = commits[0].commit.author.date
                    if last_commit is None or latest_commit > last_commit:
                        last_commit = latest_commit
            except GithubException as e:
                # Skip repositories with issues (e.g., empty repositories)
                print(f"Skipping repository {repo.name} due to error: {e.data.get('message', str(e))}")
                continue
        
        # Calculate hours since last commit if available
        if last_commit:
            now = datetime.now(timezone.utc)
            hours_since_last_commit = (now - last_commit).total_seconds() / 3600

        # Prepare output as JSON
        data = {
            "username": user.login,
            "name": user.name,
            "public_repos": num_repos,
            "total_stars": total_stars,
            "most_starred_repo": {
                "name": most_starred_repo.name if most_starred_repo else None,
                "stars": most_stars
            },
            "last_commit_datetime": last_commit.isoformat() if last_commit else None,
            "hours_since_last_commit": hours_since_last_commit,
            "followers": user.followers,
            "following": user.following,
            "created_at": user.created_at.isoformat(),
            "company": user.company,
            "bio": user.bio,
            "location": user.location,
            "blog": user.blog,
            "email": user.email
        }

        return data
    except (GithubException, RetryError, MaxRetryError) as e:
        # Handle API-related errors and skip problematic profiles
        print(f"Error processing profile {profile_url}: {str(e)}")
        return None

# Function to process all .txt files and get the combined data
def process_all_txt_files(directory):
    combined_data = []
    
    # Loop through all files in the specified directory
    for i, filename in enumerate(os.listdir(directory)):
        print(f"{i}/{len(os.listdir(directory))}")
        if filename.endswith(".txt"):
            file_path = os.path.join(directory, filename)
            # Extract speaker ID and GitHub URL from the current file
            speaker_data = extract_data_from_txt(file_path)
            # Get GitHub profile data if GitHub URL is found
            if speaker_data["github_url"]:
                profile_data = get_github_profile_data(speaker_data["github_url"])
                speaker_data["github_profile_data"] = profile_data
            else:
                speaker_data["github_profile_data"] = None
            
            # Add to combined data if the profile data is successfully fetched
            combined_data.append(speaker_data)
    
    return combined_data

# Function to save the combined data to a CSV file
def save_to_csv(data, csv_file_path):
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['speaker_id', 'github_url', 'username', 'name', 'public_repos', 'total_stars', 'most_starred_repo_name', 'most_starred_repo_stars', 'last_commit_datetime', 'hours_since_last_commit', 'followers', 'following', 'created_at', 'company', 'bio', 'location', 'blog', 'email']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for item in data:
            github_data = item.get("github_profile_data", {})
            if github_data:  # Ensure the GitHub data is not None
                writer.writerow({
                    'speaker_id': item.get('speaker_id'),
                    'github_url': item.get('github_url'),
                    'username': github_data.get('username'),
                    'name': github_data.get('name'),
                    'public_repos': github_data.get('public_repos'),
                    'total_stars': github_data.get('total_stars'),
                    'most_starred_repo_name': github_data.get('most_starred_repo', {}).get('name'),
                    'most_starred_repo_stars': github_data.get('most_starred_repo', {}).get('stars'),
                    'last_commit_datetime': github_data.get('last_commit_datetime'),
                    'hours_since_last_commit': github_data.get('hours_since_last_commit'),
                    'followers': github_data.get('followers'),
                    'following': github_data.get('following'),
                    'created_at': github_data.get('created_at'),
                    'company': github_data.get('company'),
                    'bio': github_data.get('bio'),
                    'location': github_data.get('location'),
                    'blog': github_data.get('blog'),
                    'email': github_data.get('email')
                })

# Main code execution
all_speaker_data = process_all_txt_files(data_directory)

# Save the data to a CSV file
save_to_csv(all_speaker_data, output_csv_path)

# Print the JSON result
print(json.dumps(all_speaker_data, indent=4))


0/35
Scraping github for: https://github.com/sixfeetup
1/35
Scraping github for: https://github.com/gijzelaerr
Skipping repository gh_repo_test due to error: Git Repository is empty.
Skipping repository owlcat due to error: Git Repository is empty.
Skipping repository pages-test due to error: Git Repository is empty.
Skipping repository playground due to error: Git Repository is empty.
Skipping repository purify-debian due to error: Git Repository is empty.
2/35
3/35
4/35
5/35
6/35
7/35
Scraping github for: https://github.com/luisdza
8/35
Scraping github for: https://github.com/singhsegv
9/35
Scraping github for: https://github.com/drnlm
10/35
Scraping github for: https://github.com/Divya063
Skipping repository cassandra_notes due to error: Git Repository is empty.
Skipping repository Data-Science due to error: Git Repository is empty.
Skipping repository lsmkv due to error: Git Repository is empty.
Skipping repository twitter-clone due to error: Git Repository is empty.
11/35
Scraping

# Create final .txt documents with all the enriched data

In [28]:
dfg = pd.read_csv("data/github.csv")
dfg

Unnamed: 0,speaker_id,github_url,username,name,public_repos,total_stars,most_starred_repo_name,most_starred_repo_stars,last_commit_datetime,hours_since_last_commit,followers,following,created_at,company,bio,location,blog,email
0,8,https://github.com/sixfeetup,sixfeetup,Six Feet Up,127,210,scaf,79,2024-09-25T17:04:43+00:00,164.035416,19,0,2011-09-08T20:39:11+00:00,,"Accelerating IMPACT with App Dev, AI & Big Data","Fishers, IN",http://www.sixfeetup.com,info@sixfeetup.com
1,20,https://github.com/gijzelaerr,gijzelaerr,Gijs Molenaar,172,748,python-snap7,648,2024-09-17T16:20:59+00:00,356.831831,188,111,2010-07-08T11:22:39+00:00,spotify,"astro software composer, music brewer, beer pr...",Amsterdam - Windhoek,http://pythonic.nl,gijsmolenaar@gmail.com
2,21,https://github.com/luisdza,luisdza,Luis de Sousa,8,3,aerialmzansi-website,2,2024-08-22T13:24:29+00:00,983.777285,23,112,2014-04-12T05:41:28+00:00,@Syeop,Creating and dreaming,"Johannesburg, South Africa",,
3,22,https://github.com/singhsegv,singhsegv,Rajdeep,56,51,kitaab_bot,12,2024-09-23T09:53:19+00:00,219.319172,63,209,2015-03-13T17:19:37+00:00,,"Software Engineering, Robotics and Computer Vi...",Bangalore,singhsegv.github.io,iamrajdeep1008@gmail.com
4,32,https://github.com/drnlm,drnlm,Neil Muller,30,5,matplotlib-py3,1,2024-09-29T14:11:56+00:00,71.021116,7,0,2011-02-28T09:19:10+00:00,,,"Cape Town, South Africa",,
5,26,https://github.com/Divya063,Divya063,Divya Rani,190,17,distributedKV,4,2024-08-29T13:34:16+00:00,815.73507,134,82,2016-08-24T09:28:22+00:00,,\r\n Contributor @kubernetes-sigs | CKA | G...,,,
6,25,https://github.com/adeline-pepela,adeline-pepela,Adeline Makokha,39,0,,0,2024-09-16T12:50:07+00:00,384.48877,13,9,2021-06-08T13:20:20+00:00,,"Software Developer, 2X AWS Certified, Telecomm...",Kenya,,
7,31,https://github.com/sheenarbw,sheenarbw,sheenarbw,76,22,tutorial-airflow,6,2024-10-02T13:00:38+00:00,0.347602,77,12,2011-03-30T11:39:05+00:00,,,,https://sheenaoc.com,
8,17,https://github.com/sheenarbw,sheenarbw,sheenarbw,76,22,tutorial-airflow,6,2024-10-02T13:00:38+00:00,0.376927,77,12,2011-03-30T11:39:05+00:00,,,,https://sheenaoc.com,
9,29,https://github.com/czue,czue,Cory Zue,67,1016,celery-progress,464,2024-09-26T13:58:03+00:00,143.448079,343,41,2009-03-24T15:34:08+00:00,"SaaS Pegasus, Dimagi",Web developer and product maker. @dimagi,"Cape Town, South Africa",http://www.coryzue.com/,cory@coryzue.com


In [31]:
import os
import pandas as pd

# Define the input directory with the existing .txt files and the dataframe dfg
input_dir = 'data/combined/'
output_dir = 'data/documents/'

# Ensure the output directory exists (in case you want to save a copy there)
os.makedirs(output_dir, exist_ok=True)

# Function to extract Speaker ID from a given .txt file
def extract_speaker_id_from_txt(filepath):
    with open(filepath, 'r') as file:
        for line in file:
            if line.startswith("Speaker ID:"):
                return int(line.split(":")[1].strip())  # Extract and return the Speaker ID
    return None

# Loop through all .txt files in the directory
for filename in os.listdir(input_dir):
    if filename.endswith(".txt"):
        file_path = os.path.join(input_dir, filename)
        
        # Extract Speaker ID from the .txt file
        speaker_id_in_file = extract_speaker_id_from_txt(file_path)
        
        # Find matching row in the dataframe based on speaker_id
        matching_row = dfg[dfg['speaker_id'] == speaker_id_in_file]
        
        if not matching_row.empty:
            row = matching_row.iloc[0]  # Get the first (and only) matching row
            
            # Append Github data to the existing file
            with open(file_path, 'a') as f:
                f.write("\n--- Github ---\n")
                for column in dfg.columns:
                    value = row[column]
                    if pd.isna(value):
                        value = ""  # Replace NaN values with empty strings
                    f.write(f"{column}: {value}\n")


# Load documents into vector store

In [3]:
openai_api_key = "sk-proj-aXO5P8Fe_i0K6yeE..."

## load vectorstore

In [4]:
import os
import openai
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# Set your OpenAI API key
openai.api_key = openai_api_key

# Path to your documents folder
docs_folder = "data/documents/"

# Initialize OpenAI embeddings
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Read all text files from the directory
documents = []
for filename in os.listdir(docs_folder):
    if filename.endswith(".txt"):
        with open(os.path.join(docs_folder, filename), "r") as file:
            text = file.read()
            doc = Document(page_content=text)
            documents.append(doc)

# Embed the documents and store embeddings in FAISS
vectorstore = FAISS.from_documents(documents, embedding_model)

# Save FAISS index
vectorstore.save_local("faiss_index")


  embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")


# Load structured data into SQLite

In [23]:
# Normalize column names function
def normalize_column_names(df):
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
    return df

# Load CSV files into Pandas DataFrames
github_df = pd.read_csv('data/github.csv')
speakers_df = pd.read_csv('data/speakers.csv')
talks_df = pd.read_csv('data/talks.csv')
topics_df = pd.read_csv('data/topics.csv')

# Normalize column names
github_df = normalize_column_names(github_df)
speakers_df = normalize_column_names(speakers_df)
talks_df = normalize_column_names(talks_df)
topics_df = normalize_column_names(topics_df)

# Create a SQLite connection
conn = sqlite3.connect('data/conference_data.db')

# Save the dataframes to SQLite
github_df.to_sql('github', conn, if_exists='replace', index=False)
speakers_df.to_sql('speakers', conn, if_exists='replace', index=False)
talks_df.to_sql('talks', conn, if_exists='replace', index=False)
topics_df.to_sql('topics', conn, if_exists='replace', index=False)

36

## Test Query Database

In [21]:
# Create a SQLite connection
conn = sqlite3.connect('data/conference_data.db')

# Example join queries
with conn:
    # Query to get speaker details along with their GitHub information
    query1 = '''
    SELECT s.name, s.twitter, s.github, g.username, g.public_repos, g.followers
    FROM speakers s
    JOIN github g ON s.github = g.github_url
    LIMIT 5;
    '''
    result1 = pd.read_sql_query(query1, conn)
    print("Speaker GitHub Information:")
    print(result1)

    # Query to get talks and their associated topics
    query2 = '''
    SELECT t.title, t.abstract, p.assigned_topic
    FROM talks t
    JOIN topics p ON t.talk_id = p.talk_id
    LIMIT 5;
    '''
    result2 = pd.read_sql_query(query2, conn)
    print("\nTalks with Topics:")
    print(result2)

    # Query to get talks and their associated speakers
    query3 = '''
    SELECT t.title, t.abstract, s.name
    FROM talks t
    JOIN speakers s ON t.speaker_id = s.speaker_id
    LIMIT 5;'''
    result3 = pd.read_sql_query(query3, conn)
    print("\nTalks with Speakers:")
    print(result3)

Speaker GitHub Information:
             name                      twitter  \
0     Alex Conway  https://twitter.com/alxcnwy   
1  Nunudzai Mrewa                         None   
2  Ruan Pretorius                         None   
3   Schalk Venter                         None   
4  Shaun De Ponte                         None   

                            github      username  public_repos  followers  
0       https://github.com/alxcnwy       alxcnwy            18         44  
1     https://github.com/JustNunuz     JustNunuz            10          4  
2       https://github.com/ruankie       ruankie            71         19  
3  https://github.com/schalkventer  schalkventer           171        365  
4     https://github.com/nawtybean     nawtybean            16          5  

Talks with Topics:
                                               title  \
0                            Applying AI with Python   
1  Harnessing the Power of Community: Lessons fro...   
2  It’s About Time: Time-Se