In [2]:
import os
import sys

from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Directory that holds the project's importable packages; it is added to
# sys.path so the project-local imports further down can be resolved.
PROJECT_PATH = os.getenv("PROJECT_PATH")
if not PROJECT_PATH:
    # Fail here with a clear message rather than appending None to sys.path
    # and hitting an unrelated-looking import error in a later cell.
    raise RuntimeError(
        "PROJECT_PATH is not set; define it in the environment or in a .env file."
    )
# Re-running the cell must not keep growing sys.path with duplicates.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

In [3]:
# Show the path that was appended to sys.path, so a wrong or missing value is spotted early.
print(PROJECT_PATH)

D:\\UU\\Sem3\\Project\\svt\\src


In [4]:
from postgresql.config.settings import DATABASE_URL
from postgresql.config.db import session
from postgresql.database_models import Authors, Challenges, Music, Posts, PostsChallenges
import pandas as pd
import asyncio
from sqlalchemy.future import select

In [5]:
async def fetch_posts_challenges():
    """Load every post/challenge link into a pandas DataFrame.

    Joins ``Posts`` to ``Challenges`` through ``PostsChallenges`` and selects
    the post id and description together with the challenge id, title and
    hashtag count, giving one row per (post, challenge) pair.

    Returns:
        pandas.DataFrame: one column per selected field, named after the keys
        reported by the executed result. Two ``id`` columns are selected, so
        the second one comes back under a de-duplicated key (``id_1`` in the
        recorded output). The column names are present even when the query
        matches no rows, so later column lookups do not raise ``KeyError`` on
        an empty table.
    """
    stmt = (
        select(
            Posts.id,
            Posts.description,
            Challenges.id,
            Challenges.title,
            Challenges.hashtag_count,
        )
        .join(PostsChallenges, Posts.id == PostsChallenges.post_id)
        .join(Challenges, Challenges.id == PostsChallenges.challenge_id)
    )

    async with session() as s:
        result = await s.execute(stmt)
        # Read the column keys before consuming the rows: they are the only
        # source of the schema when no rows come back.
        columns = list(result.keys())
        rows = result.fetchall()

        records = [dict(row._mapping) for row in rows]
        return pd.DataFrame(records, columns=columns)

In [6]:
# Top-level await: relies on the notebook kernel running cells inside an event loop.
df = await fetch_posts_challenges()
# Preview the first ten joined rows.
print(df.head(10))

                    id                                        description  \
0  7434126174831267105  A video of Kamala Harris taking a voter phone ...   
1  7434126174831267105  A video of Kamala Harris taking a voter phone ...   
2  7434126174831267105  A video of Kamala Harris taking a voter phone ...   
3  7434126174831267105  A video of Kamala Harris taking a voter phone ...   
4  7435029475638660394  Kamala Harris received more than $1.2 billion ...   
5  7435029475638660394  Kamala Harris received more than $1.2 billion ...   
6  7435029475638660394  Kamala Harris received more than $1.2 billion ...   
7  7435029475638660394  Kamala Harris received more than $1.2 billion ...   
8  7435029475638660394  Kamala Harris received more than $1.2 billion ...   
9  7401849198678723845                     #Fypp #harris #tkcpage #viral    

               id_1            title  hashtag_count  
0            194782           harris              6  
1          51333784     kamalaharris        

In [7]:
# Give the columns unambiguous names and store both ids as 64-bit integers.
# rename() ignores keys that are absent, so re-running this cell is harmless.
# astype() with an explicit "int64" converts whole columns at once, works on
# pandas versions that predate DataFrame.map, and does not depend on the
# platform's default integer width for the 19-digit post ids.
df = df.rename(
    columns={"id": "post_id", "id_1": "challenge_id", "title": "challenge_title"}
).astype({"post_id": "int64", "challenge_id": "int64"})

In [8]:
# Turn every challenge title into a Python string, element by element.
df["challenge_title"] = df["challenge_title"].map(str)

In [9]:
# Report column dtypes, non-null counts and memory usage of the table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   post_id          151 non-null    int64 
 1   description      151 non-null    object
 2   challenge_id     151 non-null    int64 
 3   challenge_title  151 non-null    object
 4   hashtag_count    151 non-null    int64 
dtypes: int64(3), object(2)
memory usage: 6.0+ KB


In [None]:
# Write the post/challenge table to a CSV file in the current working directory,
# leaving out the DataFrame index.
df.to_csv("posts_challenges.csv", index=False)

In [None]:
async def search_by_hashtag(hashtag: str):
    """Find the posts linked to the challenge whose title equals ``hashtag``.

    Args:
        hashtag: Challenge title to look for; compared with ``==`` against
            ``Challenges.title``, so the match is exact.

    Returns:
        A list of result rows, each carrying ``(Posts.id, Posts.description)``.
    """
    post_to_link = Posts.id == PostsChallenges.post_id
    link_to_challenge = PostsChallenges.challenge_id == Challenges.id

    stmt = (
        select(Posts.id, Posts.description)
        .join(PostsChallenges, post_to_link)
        .join(Challenges, link_to_challenge)
        .where(Challenges.title == hashtag)
    )

    async with session() as db:
        outcome = await db.execute(stmt)
        return outcome.all()

In [17]:
# Example lookup: list every post linked to the challenge titled "harris".
hashtag_name = "harris"
search = await search_by_hashtag(hashtag=hashtag_name)
# Each element is a (post id, description) row.
for s in search:
    print(s)

('7434126174831267105', "A video of Kamala Harris taking a voter phone call went viral after some social media users said she ‘mistakenly’ showed her phone was open on the ca ... (41 characters truncated) ... ed' with some Trump supporters pointing out that you can in fact stay on a call while using the camera.  #harris #kamalaharris #uselection #phonecall")
('7401849198678723845', '#Fypp #harris #tkcpage #viral ')
('7323729957165763846', 'astagfirullah harris😭👍 buka bajunya anyg😭 #harriscaine #vtuber #harris #vtuberindonesia #vtuberedit #nayach #AKAvirtual #sol4ce ')
('7413384929469304096', 'Debata w USA. Harris do Trumpa: Oddałbyś Polskę Putinowi za przysługę #polityka #usa #harris #trump2024 #stanyzjednoczone #polska #usa #putin #rosja #ukraina #nato #europa')
('7394160903677054239', 'Kamala Harris is the happiest lady in the world right now #KamalaHarris #Harris #PresidentHarris #Harris2024 #CallHarris2024 ')
('7378766522669468933', 'gws tante raja 😘😚 #fyp #harris #raja #harrisraja

In [15]:
# One "transaction" per post: the list of challenge titles attached to that
# post, with the posts taken in ascending post_id order.
transactions = [
    titles.tolist()
    for _, titles in df.groupby("post_id")["challenge_title"]
]
print(transactions)

[['fypシ'], ['fypシ', 'harrystyles', 'foryoupage', 'foryou', 'harries', 'harryfan', 'hshot', 'harrystyleshot', 'meow'], ['kamalaharris', 'vp', 'pride', 'foryourpride', 'ally'], ['harriscaine', 'vtuber', 'harris', 'vtuberindonesia', 'vtuberedit', 'nayach', 'akavirtual', 'sol4ce'], ['kamalaharris', 'vicepresident', 'usa', 'usa_tiktok', 'eeuu', 'unitedstates', 'estadosunidos', 'foryou'], ['xyzbca', 'foryou', 'harry', 'initials', 'names', 'real', 'viral', 'love', 'crush', 'bf', 'harryname', 'foryoupage', 'fyp', 'xoxo', 'fypage', 'viralvideo', 'tiktok', 'haha', 'lolz'], ['fyp', 'harris', 'raja', 'harrisraja'], ['kamalaharris', 'harris', 'presidentharris', 'harris2024', 'callharris2024'], ['foryou', 'fyp', 'usa', 'us', 'celebrities', 'kids', 'children'], ['duet', 'kamalaharris', 'foryoupage', 'paratii', 'fyp', 'viral', 'tiktok', '❤️❤️'], ['republican', 'president', 'vicepresident', 'usa', 'politics', 'jdvance', 'vote2024', 'trump'], ['bitcoin', 'crypto', 'cryptocurrency', 'kamalaharris', 'simp

In [25]:
# Number of transactions, i.e. one per distinct post_id group.
print(len(transactions))

22


In [16]:
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the transactions: one row per transaction, one boolean column
# per distinct challenge title.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
txn_df = pd.DataFrame(te_ary, columns=te.columns_)
print(txn_df.head(10))

      1d  1direction  1dtiktok   2024  akavirtual   ally  america  \
0  False       False     False  False       False  False    False   
1  False       False     False  False       False  False    False   
2  False       False     False  False       False   True    False   
3  False       False     False  False        True  False    False   
4  False       False     False  False       False  False    False   
5  False       False     False  False       False  False    False   
6  False       False     False  False       False  False    False   
7  False       False     False  False       False  False    False   
8  False       False     False  False       False  False    False   
9  False       False     False  False       False  False    False   

   atlantageorgia  barackobama  betterthanwords  ...     vp  vtuber  \
0           False        False            False  ...  False   False   
1           False        False            False  ...  False   False   
2           False        Fa

In [None]:
# (rows, columns) of the encoded frame: transactions by distinct challenge titles.
print(txn_df.shape)

(22, 107)
22


In [22]:
from mlxtend.frequent_patterns import apriori, association_rules
# Mine itemsets whose support (share of transactions containing them) is at
# least 0.05; use_colnames=True reports challenge titles instead of column indices.
# NOTE(review): with the 22 transactions shown in the recorded output, the lowest
# support that appears is 2/22, so every kept itemset occurs in at least two posts
# — re-check the threshold if the data set grows.
frequent_itemsets = apriori(txn_df, min_support=0.05, use_colnames=True)
print(frequent_itemsets)

     support                          itemsets
0   0.181818                          (foryou)
1   0.181818                      (foryoupage)
2   0.272727                             (fyp)
3   0.090909                            (fypシ)
4   0.272727                          (harris)
5   0.136364                      (harris2024)
6   0.090909                     (harrystyles)
7   0.545455                    (kamalaharris)
8   0.090909                        (politics)
9   0.090909                          (tiktok)
10  0.090909                           (trump)
11  0.090909                       (trump2024)
12  0.090909                    (unitedstates)
13  0.227273                             (usa)
14  0.090909                      (usa_tiktok)
15  0.090909                   (vicepresident)
16  0.136364                           (viral)
17  0.090909              (foryou, foryoupage)
18  0.090909                     (foryou, fyp)
19  0.090909                     (foryou, usa)
20  0.090909 

In [29]:
# Derive rules from the frequent itemsets, keeping those with confidence >= 0.5.
# The transaction count is passed by keyword so it cannot be mistaken for
# another argument of association_rules.
rules = association_rules(
    frequent_itemsets,
    num_itemsets=len(txn_df),
    metric="confidence",
    min_threshold=0.5,
)
print(rules)

                    antecedents                consequents  \
0                      (foryou)               (foryoupage)   
1                  (foryoupage)                   (foryou)   
2                      (foryou)                      (fyp)   
3                      (foryou)                      (usa)   
4                  (foryoupage)                      (fyp)   
5                  (foryoupage)             (kamalaharris)   
6                      (tiktok)               (foryoupage)   
7                  (foryoupage)                   (tiktok)   
8                       (viral)               (foryoupage)   
9                  (foryoupage)                    (viral)   
10                     (tiktok)                      (fyp)   
11                      (viral)                      (fyp)   
12                 (harris2024)             (kamalaharris)   
13               (unitedstates)             (kamalaharris)   
14                 (usa_tiktok)             (kamalaharris)   
15      

In [30]:
# Write the rules table to a CSV file in the current working directory, leaving
# out the DataFrame index.
# NOTE(review): the antecedents/consequents columns presumably hold set-like
# objects that CSV stores as plain text — confirm how they should be parsed on reload.
rules.to_csv("association_rules.csv", index=False)