<a href="https://colab.research.google.com/github/ktynski/Marketing_Automations_Notebooks_With_GPT/blob/main/Automatic_Deep_TikTok_Insights_with_GPT_and_Whisper_Public.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install apify-client
!pip install spleeter openai tensorflow
!pip install typer click==8.0.4
!pip install pydub

In [None]:
!pip install --upgrade httpx apify-client


In [None]:
import pandas as pd
import concurrent.futures
import threading
from apify_client import ApifyClient
import time

# Apify connection settings for the autocomplete-scraping cell.
# APIFY_API_URL is defined for reference; the visible code never reads it.
APIFY_API_URL = 'https://api.apify.com/v2'
# Actor used to fetch TikTok search-autocomplete suggestions.
ACTOR_NAME = 'mscraper/tiktok-search-autocomplete'
# Placeholder: replace with a real Apify API token before running.
APIFY_API_KEY = 'Your Apify API Key'

# Shared client used by every worker thread in this cell.
client = ApifyClient(APIFY_API_KEY)

# Semaphore with 12 permits, used by recursive_search below.
semaphore = threading.Semaphore(12)
# Accumulates one row per (search term, suggestion) pair; get_autocomplete
# rebinds this module-level name as results arrive.
df = pd.DataFrame(columns=["search_term", "suggestion", "lang"])

# Guards the read-modify-write of the module-level ``df``; get_autocomplete is
# invoked from several worker threads at once.
_df_lock = threading.Lock()


def get_autocomplete(search):
    """Fetch TikTok autocomplete suggestions for one search term.

    Runs the Apify actor named by ``ACTOR_NAME`` with ``search`` as the only
    query, appends one row per returned item to the module-level ``df``
    (columns ``search_term``, ``suggestion``, ``lang``) and returns the
    suggestion strings.

    Args:
        search: The search term to send to the autocomplete actor.

    Returns:
        A list with the ``content`` field of every dataset item, in dataset
        order. Empty when the run did not finish with status ``SUCCEEDED``.
    """
    global df

    run_input = {
        "proxy": {
            "useApifyProxy": True,
            "apifyProxyCountry": "US",
            "apifyProxyGroups": ["RESIDENTIAL"]
        },
        "query": [search]
    }

    # ``call`` blocks until the run reaches a terminal state, so no polling is
    # needed. A run that ends as FAILED/ABORTED/TIMED-OUT is skipped instead
    # of being waited on forever.
    run = client.actor(ACTOR_NAME).call(run_input=run_input)
    status = run.get('status') if run else None
    if status != 'SUCCEEDED':
        print(f"Autocomplete run for {search!r} did not succeed (status: {status}); skipping")
        return []

    rows = [
        {"search_term": search, "suggestion": item['content'], "lang": item['lang']}
        for item in client.dataset(run["defaultDatasetId"]).iterate_items()
    ]

    if rows:
        batch = pd.DataFrame(rows, columns=["search_term", "suggestion", "lang"])
        # DataFrame.append no longer exists in pandas 2.x; concatenate one
        # batch per call under a lock so concurrent callers do not drop rows.
        with _df_lock:
            df = batch if df.empty else pd.concat([df, batch], ignore_index=True)

    return [row["suggestion"] for row in rows]

def recursive_search(search_term, level=1, max_level=3):
    """Expand a search term into autocomplete suggestions, recursively.

    Calls ``get_autocomplete`` for ``search_term`` and then repeats the
    process for every returned suggestion, one level deeper, until
    ``max_level`` has been processed. Results are collected by
    ``get_autocomplete`` as a side effect; nothing is returned.

    Args:
        search_term: Term to look up at this level.
        level: Depth of this call (the initial call is level 1).
        max_level: Deepest level for which suggestions are still fetched.
    """
    if level > max_level:
        return

    # Hold a semaphore permit for the duration of the actor run itself, so the
    # permit count bounds the number of simultaneous Apify runs. The permit is
    # released before waiting on child tasks, so parents never starve children.
    with semaphore:
        suggestions = get_autocomplete(search_term)

    # Children of the deepest level would return immediately; skip spawning them.
    if level >= max_level or not suggestions:
        return

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(recursive_search, suggestion, level + 1, max_level)
            for suggestion in suggestions
        ]

        # Surface any exception raised inside a worker.
        for future in concurrent.futures.as_completed(futures):
            future.result()

# Seed the crawl with 'Camping' and expand it two levels deep; the rows are
# collected into the module-level df by get_autocomplete.
recursive_search('Camping', max_level=2)
# Keep one row per distinct suggestion (the same suggestion can be reached
# from several search terms).
df = df.drop_duplicates(subset='suggestion')

print(df)


In [None]:
# Persist the scraped suggestions so later cells can be re-run without
# repeating the crawl.
df.to_csv('scraped_tiktok_searches.csv')

# Second name bound to the same DataFrame object (not a copy); the bare
# expression below displays it as the cell output.
fordl = df
fordl

In [None]:
import pandas as pd
import concurrent.futures
import threading
from apify_client import ApifyClient

# Apify connection settings for the video-scraping cell.
# APIFY_API_URL is defined for reference; the visible code never reads it.
APIFY_API_URL = 'https://api.apify.com/v2'
# Actor used to scrape TikTok videos by hashtag.
ACTOR_NAME = 'clockworks/tiktok-scraper'
# Placeholder: replace with a real Apify API token before running.
APIFY_API_KEY = 'Your Apify API Key'

# Shared client used by every worker thread in this cell.
client = ApifyClient(APIFY_API_KEY)

# Create a semaphore with 24 permits (rebinds the `semaphore` name from the
# earlier cell); used by concurrent_runs below.
semaphore = threading.Semaphore(24)

def run_actor(hashtag):
    """Scrape TikTok posts for a single hashtag via the configured Apify actor.

    Starts the actor named by ``ACTOR_NAME`` with a fixed configuration
    (25 results per page, videos downloaded into the ``mytiktokvideos``
    key-value store, Apify proxy enabled) and reads back the run's default
    dataset.

    Args:
        hashtag: The hashtag to scrape.

    Returns:
        A list of the dataset items, each with an extra ``'hashtag'`` key
        holding the hashtag that produced it.
    """
    actor_input = {
        "hashtags": [hashtag],
        "resultsPerPage": 25,
        "scrapeEmptyChannelInfo": False,
        "shouldDownloadVideos": True,
        "shouldDownloadCovers": False,
        "videoKvStoreIdOrName": "mytiktokvideos",
        "proxyConfiguration": {"useApifyProxy": True},
    }

    finished_run = client.actor(ACTOR_NAME).call(run_input=actor_input)
    dataset_items = client.dataset(finished_run["defaultDatasetId"]).iterate_items()

    collected = []
    for record in dataset_items:
        # Keep the full item and tag it with the hashtag that was searched.
        record['hashtag'] = hashtag
        collected.append(record)
    return collected

def concurrent_runs(df):
    """Scrape every suggestion in ``df`` concurrently and merge the results.

    Args:
        df: DataFrame with a ``'suggestion'`` column; each value is passed to
            ``run_actor`` as a hashtag.

    Returns:
        A DataFrame built from all items returned by all ``run_actor`` calls,
        in completion order. Empty when ``df`` has no rows.

    Raises:
        Whatever ``run_actor`` raises for any hashtag (re-raised when that
        task's result is collected).
    """
    hashtags = df['suggestion'].tolist()

    def _limited_run(hashtag):
        # Acquire the permit inside the worker so it is held while the actor
        # actually runs; acquiring it around submit() would limit nothing
        # because submit() returns immediately.
        with semaphore:
            return run_actor(hashtag)

    all_results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(_limited_run, hashtag) for hashtag in hashtags]

        for future in concurrent.futures.as_completed(futures):
            all_results.extend(future.result())

    return pd.DataFrame(all_results)

# Drop rows with NaN values
df = df.dropna()

# Or replace NaNs with a placeholder value
# NOTE(review): this runs after dropna() above, so no NaNs should remain for
# it to replace; the two steps read as alternatives but both execute.
df = df.fillna('')

# Then run the function
# Only the first 50 suggestions are scraped.
df = df[:50]
new_df = concurrent_runs(df)
# Persist the raw scrape; a later cell reloads it from this file.
new_df.to_csv('tiktokscrape.csv')
print(new_df)




In [None]:
# Re-export the scrape results; writes the same file as the previous cell.
new_df.to_csv('tiktokscrape.csv')

In [None]:
import concurrent.futures
import openai
import requests
import pandas as pd
import os
import shutil
from spleeter.separator import Separator
from pydub import AudioSegment
import ast
import threading

# Reload the scrape from disk so this cell can run without the scraping cells.
new_df = pd.read_csv('tiktokscrape.csv')
# df and new_df refer to the same DataFrame object (no copy is made).
df = new_df

# Placeholder: replace with a real OpenAI API key before running.
openai.api_key = "Your OpenAI API Key"

# Create a separator with 2 stems (vocals and accompaniment)
separator = Separator('spleeter:2stems')
# download_and_transcribe holds this lock around every use of the shared
# separator, so only one thread separates audio at a time.
separator_lock = threading.Lock()

# Modify download_and_transcribe function
def download_and_transcribe(row):
    """Download one TikTok video, isolate its vocals and transcribe them.

    Pipeline: download the mp4, extract an mp3 with ffmpeg, split vocals from
    accompaniment with the shared Spleeter ``separator``, convert the vocal
    stem to mp3 and send it to ``openai.Audio.translate`` with the
    ``whisper-1`` model. All intermediate files live under ``/content`` and
    are removed afterwards, whether or not the pipeline succeeded.

    Args:
        row: A record with an ``'id'`` field and a ``'mediaUrls'`` field
            holding the string form of a list of URLs; the first URL is used.

    Returns:
        A ``(id, text)`` tuple with the row's id and the transcript text.

    Raises:
        requests.HTTPError: If the video download returns an error status.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Local import keeps this block self-contained; subprocess is stdlib.
    import subprocess

    video_id = row['id']
    video_url = ast.literal_eval(row['mediaUrls'])[0]
    filename = f"/content/{video_id}.mp4"
    audio_filename = f"/content/{video_id}.mp3"
    stems_dir = f"/content/{video_id}"  # directory created by spleeter
    vocal_filename = f"{stems_dir}/vocals.mp3"

    try:
        # Download the video; the timeout keeps a stalled connection from
        # blocking a worker thread indefinitely.
        with requests.get(video_url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Convert mp4 to mp3 for audio separation. Arguments are passed as a
        # list (no shell interpolation) and a failed conversion raises instead
        # of being silently ignored. -y overwrites a stale output file rather
        # than prompting.
        subprocess.run(
            ['ffmpeg', '-y', '-i', filename, '-vn', '-ar', '44100',
             '-ac', '2', '-b:a', '192k', audio_filename],
            check=True,
        )

        # Separate the vocal from music; the separator is shared between
        # threads, so access is serialized.
        with separator_lock:
            separator.separate_to_file(audio_filename, stems_dir)

        # Convert wav to mp3
        audio = AudioSegment.from_wav(f"{stems_dir}/{video_id}/vocals.wav")
        audio.export(vocal_filename, format="mp3")

        # Transcribe the vocal
        with open(vocal_filename, "rb") as vocal_file:
            transcript = openai.Audio.translate("whisper-1", vocal_file)
    finally:
        # Clean up the downloaded and generated files even when a step failed,
        # so failed rows do not fill the disk.
        for path in (filename, audio_filename):
            if os.path.exists(path):
                os.remove(path)
        shutil.rmtree(stems_dir, ignore_errors=True)

    return (row['id'], transcript["text"])



# Transcribe all rows with up to 4 worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Submit all tasks to the executor, remembering which DataFrame index
    # each future belongs to.
    future_to_row = {
        executor.submit(download_and_transcribe, row): index
        for index, row in df.iterrows()
    }

    # Collect the results as they become available
    for future in concurrent.futures.as_completed(future_to_row):
        i = future_to_row[future]
        try:
            # The returned id is not needed here; the row is located via `i`.
            # (Named _video_id so the builtin `id` is not rebound.)
            _video_id, transcription = future.result()
        except Exception as exc:
            # Best effort: a failed row is reported and left without a
            # transcription; the remaining rows continue.
            print(f'Row {i} generated an exception: {exc}')
        else:
            df.loc[i, 'transcription'] = transcription

df.to_csv('transcribed.csv')

In [None]:
def gpt_parse_tiktok_transcriptions(df, chunk_size=10, retries=3):
    """Ask GPT for marketing insights on the transcriptions, chunk by chunk.

    Splits ``df`` into chunks of ``chunk_size`` rows, renders each row's
    transcription and engagement fields as text and sends every chunk to
    ``gpt-3.5-turbo-16k`` in one chat request. The answers are collected,
    written to ``parsed_transcriptions.csv`` and returned.

    Args:
        df: DataFrame with the columns ``hashtags``, ``transcription``,
            ``shareCount``, ``playCount``, ``commentCount``, ``mentions``,
            ``effectStickers`` and ``searchHashtag``.
        chunk_size: Number of rows sent per request.
        retries: Maximum number of attempts per chunk. A chunk whose attempts
            all fail is reported and skipped.

    Returns:
        A DataFrame with a single ``parsed_transcription`` column holding one
        row per successfully analysed chunk.
    """
    # Local import keeps this block self-contained; time is stdlib.
    import time

    # Create a container to store the parsed transcriptions
    parsed_transcriptions = []

    # Chunk the DataFrame
    chunks = [df[i:i+chunk_size] for i in range(0, df.shape[0], chunk_size)]

    for chunk_number, chunk in enumerate(chunks, start=1):
        # Prepare the data for the batch
        batch_data = []

        for _, row in chunk.iterrows():
            # Unpack the row
            hashtags = row['hashtags']
            tiktok_transcription = row['transcription']
            sharecount = row['shareCount']
            playcount = row['playCount']
            commentcount = row['commentCount']
            mentions = row['mentions']
            effect_stickers = row['effectStickers']
            search_hashtags = row['searchHashtag']

            # Prepare the row data
            row_data = f"""
            Transcription: {tiktok_transcription}
            Hashtags: {hashtags}
            View Count: {playcount}
            Share Count: {sharecount}
            Comment Count: {commentcount}
            Mentions: {mentions}
            Effect Stickers: {effect_stickers}
            Search Hashtags: {search_hashtags}
            """

            batch_data.append(row_data)

        # The messages do not change between attempts, so build them once per chunk.
        messages = [
                        {"role": "system",
                                "content": """You're an AI with advanced capabilities in understanding and interpreting social media content. Your current task involves processing a batch of up to 100 transcriptions from TikTok videos, each with its associated hashtags, view counts, and comment counts.
                                Your objective is to uncover valuable and actionable insights for a content marketer who aims to create impactful and resonant content on TikTok. Give specific examples from the data/transcripts to illustrate your points wherever possible.

                                Pay close attention to these specific areas in the data provided. Do not provide generically true advice, your advice should be specific to the data you are provided with and provide specific examples from the data that illustrate your point:

                                - Virality Factors: Identify unique characteristics of videos that went viral. What aspects of their content or engagement metrics could have contributed to their virality? Give at least 5 examples to illustrate.

                                - Community Engagement: Evaluate the elements of videos that have high community engagement. What strategies, themes or content types incite audience participation? Give at least 5 examples to illustrate.

                                - Story Arc: For high performing videos, what is the arc of the story like? Provide specific examples.

                                - Audience Segmentation: Deduce potential audience segments based on their reactions to different videos. What content preferences does each audience segment seem to display? Give at least 5 examples to illustrate.

                                - Effective Calls to Action: Identify strategies that lead to effective calls to action within the TikTok community. What type of calls to action are generally successful? Give at least 5 examples to illustrate.

                                - Predicting Engagement: Develop an understanding of the factors that might lead to high engagement in the future. What emerging trends or behaviours do you anticipate? Give at least 5 examples to illustrate.

                                - Viewer Retention: Analyze the strategies used to retain viewer attention throughout a video. What elements contributed to viewers watching the videos till the end? Give at least 5 examples to illustrate.

                                - Emotional Engagement: Identify the types of content that elicit strong emotional responses given the data and transcripts. Give at least 5 examples to illustrate.

                                - Brand Presence: Evaluate the balance between brand presence and audience enjoyment. How does the overt presence of a brand in the videos influence engagement? List all brands mentioned.

                                - Participation in Trends: Understand the drivers behind user participation in challenges or trends. What factors encourage users to join in? List any challenges or trends found.

                                - Authenticity: Analyze the role of authenticity in content reception. How important is it for the content to be perceived as genuine? Give at least 5 examples to illustrate.

                                - Sentiment Analysis: Determine what factors contributed to positive or negative sentiment towards a piece of content. What elements seem to sway sentiment in either direction? Give at least 5 examples to illustrate.

                                Remember, each insight should be backed by the data provided with as many specific examples as possible, and focus on providing highly detailed analysis of the content provided. Your analysis should help them understand all of the important insights of the data."""},
                                {"role": "user", "content": f"Here are the TikTok transcriptions and associated data for analysis: \n\n" + "\n\n".join(batch_data)}
        ]

        for attempt in range(1, retries + 1):
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo-16k",
                    messages=messages,
                    max_tokens=4000,
                    n=1,
                    stop=None,
                    temperature=0.7
                )
            except openai.error.OpenAIError as exc:
                # API failures raise rather than return None, so without this
                # handler the first error would abort the whole run and the
                # retry loop would never get a second attempt.
                print(f"Chunk {chunk_number}: attempt {attempt}/{retries} failed: {exc}")
                if attempt < retries:
                    time.sleep(2 ** attempt)  # simple exponential backoff
                continue

            # If the API call was successful, extract the parsed transcription and add it to the list
            if response is not None and 'choices' in response and len(response['choices']) > 0:
                parsed_transcription = response['choices'][0]['message']['content'].strip()
                print(parsed_transcription)
                parsed_transcriptions.append(parsed_transcription)
                break
        else:
            # Every attempt failed or returned no choices.
            print(f"Chunk {chunk_number}: no analysis produced after {retries} attempts; skipping")

    # Create a DataFrame from the list of parsed transcriptions
    parsed_df = pd.DataFrame(parsed_transcriptions, columns=['parsed_transcription'])

    # Save the parsed transcriptions to a CSV file
    parsed_df.to_csv('parsed_transcriptions.csv', index=False)

    return parsed_df




# Reload the transcriptions written by the transcription cell.
df = pd.read_csv('transcribed.csv')

# Run the GPT analysis on the transcriptions
# (10 rows per request, up to 3 attempts per chunk).
parsed_df = gpt_parse_tiktok_transcriptions(df, chunk_size=10, retries=3)

# Print the parsed DataFrame
print(parsed_df)






In [None]:
def gpt_secondary_analysis_batched(df, chunk_size=4):
    """Condense the per-chunk analyses into one long-form report.

    Works in two stages: first every group of ``chunk_size`` analyses is
    summarised by ``gpt-3.5-turbo-16k``; then all summaries are joined and
    sent in a final request that produces the long-form readout.

    Args:
        df: DataFrame with a ``parsed_transcription`` column. Missing (NaN)
            cells are ignored.
        chunk_size: Number of analyses summarised per first-stage request.

    Returns:
        The final analysis text, or ``None`` when there was nothing to
        analyse or the final request returned no choices.
    """

    def _complete(system_prompt, user_prompt, max_tokens):
        # One chat request; returns the stripped answer or None if no choice came back.
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-16k",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=max_tokens,
            n=1,
            stop=None,
            temperature=0.7
        )
        if response is not None and 'choices' in response and len(response['choices']) > 0:
            return response['choices'][0]['message']['content'].strip()
        return None

    # Cells that were empty in the CSV come back as NaN (a float), which
    # str.join cannot handle; drop them and make sure the rest are strings.
    analyses = df['parsed_transcription'].dropna().astype(str).tolist()

    # Create a container to store the secondary analysis results
    secondary_analysis_results = []

    for start in range(0, len(analyses), chunk_size):
        aggregated_transcriptions = ' '.join(analyses[start:start + chunk_size])

        secondary_analysis = _complete(
            "Summarize the findings and provide a readout that includes all salient info. Always provide specific examples from the transcripts",
            f"Provide the readout: {aggregated_transcriptions}.",
            1500,
        )
        if secondary_analysis is not None:
            secondary_analysis_results.append(secondary_analysis)

    # Nothing to aggregate: skip the final request instead of sending an empty prompt.
    if not secondary_analysis_results:
        return None

    # After analyzing each chunk, analyze all the results together
    aggregated_results = ' '.join(secondary_analysis_results)

    # NOTE(review): max_tokens=10000 plus the aggregated prompt must fit the
    # model's context window; confirm this holds for large inputs.
    return _complete(
        "Simulate an award winning TikTok social media expert and consultant known for highly actionable and insightful analysis of tiktok niches. Please provide a long-form analysis and client deliverable that provides a readout that includes all salient info that would be useful or interesting. Try to provide up to ten specific examples for each.  Make this as in-depth and long-form as possible.",
        f"Analyze and provide the longform readout: {aggregated_results}.",
        10000,
    )


# Reload the first-stage analyses written by gpt_parse_tiktok_transcriptions.
df = pd.read_csv('parsed_transcriptions.csv')

# Call the function (6 analyses per first-stage summary request)
final_eval = gpt_secondary_analysis_batched(df, chunk_size=6)

# Save the final analysis to a text file
# str() is applied because the function can return None, in which case the
# file will contain the text "None".
with open('Final_TikTok_Analysis.txt', 'w') as f:
    f.write(str(final_eval))
