In [1]:
import os

In [2]:
%pwd

'/mnt/d/Projects/Project Git repos/AutoVideoSectioning/Research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'/mnt/d/Projects/Project Git repos/AutoVideoSectioning'

In [5]:
import logging
from pathlib import Path
logging.basicConfig(
    # filename='extract_data.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

In [6]:
# English Video

In [21]:
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
from pydub import AudioSegment
import moviepy.editor as mp

# Function to extract audio from video
def extract_audio_from_video(video_path, audio_path):
    video = mp.VideoFileClip(str(video_path))  # Convert Path to string
    video.audio.write_audiofile(str(audio_path))  # Convert Path to string

# Function to transcribe audio using Hugging Face Transformers in 30-second chunks
def audio_to_text(audio_path, text_path):
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english").to(device)

    audio = AudioSegment.from_file(audio_path)
    total_duration = len(audio)  # Total duration in milliseconds
    # total_duration = 120 * 1000  # 30 seconds in milliseconds
    chunk_duration = 30 * 1000  # 30 seconds in milliseconds
    text = ""

    for i in range(0, total_duration, chunk_duration):
        chunk = audio[i:i + chunk_duration]
        chunk.export("temp_chunk.wav", format="wav")
        
        # Load and preprocess the audio chunk
        speech_array, sampling_rate = librosa.load("temp_chunk.wav", sr=16_000)
        inputs = processor(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)
        
        with torch.no_grad():
            logits = model(inputs.input_values.to(device), attention_mask=inputs.attention_mask.to(device)).logits
        
        predicted_ids = torch.argmax(logits, dim=-1)
        chunk_text = processor.batch_decode(predicted_ids)[0]
        text += chunk_text + " "

    # Write the transcription to the text file
    with open(text_path, "w", encoding="utf-8") as file:
        file.write(text.strip())
    print("Text successfully written to", text_path)

# Paths
video_path = Path("Dataset/Videoinput/videoplayback.mp4")
audio_path = Path("Dataset/AudioExtracted/extracted_audio.wav")
text_path = Path("Dataset/Transcribe/english_translated_transcribed_text.txt")

# Extract audio from video
extract_audio_from_video(video_path, audio_path)

# Convert audio to text
audio_to_text(audio_path, text_path)

print("Text extracted and saved to", text_path)


MoviePy - Writing audio in Dataset/AudioExtracted/extracted_audio.wav


                                                                                                                                               

MoviePy - Done.


Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
Y

Text successfully written to Dataset/Transcribe/english_translated_transcribed_text.txt
Text extracted and saved to Dataset/Transcribe/english_translated_transcribed_text.txt


# with time stamps

In [34]:
import torch
import pandas as pd
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from pydub import AudioSegment
import moviepy.editor as mp
from pathlib import Path

# Function to extract audio from video
def extract_audio_from_video(video_path, audio_path):
    video = mp.VideoFileClip(str(video_path))
    video.audio.write_audiofile(str(audio_path))

# Function to transcribe audio using Hugging Face Transformers in 30-second chunks
def audio_to_text(audio_path):
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english").to(device)

    audio = AudioSegment.from_file(audio_path)
    total_duration = len(audio)
    chunk_duration = 30 * 1000  # 30 seconds in milliseconds
    chunks = []

    for i in range(0, total_duration, chunk_duration):
        chunk = audio[i:i + chunk_duration]
        chunk.export("temp_chunk.wav", format="wav")

        # Load and preprocess the audio chunk
        speech_array, sampling_rate = librosa.load("temp_chunk.wav", sr=16_000)
        inputs = processor(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)

        with torch.no_grad():
            logits = model(inputs.input_values.to(device), attention_mask=inputs.attention_mask.to(device)).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        chunk_text = processor.batch_decode(predicted_ids)[0]
        chunks.append(chunk_text)

    return chunks

# Paths
video_path = Path("Dataset/Videoinput/videoplayback.mp4")
audio_path = Path("Dataset/AudioExtracted/extracted_audio.wav")
output_excel_path = Path("Dataset/Timestamp/TimestampedText.xlsx")

# Extract audio from video
extract_audio_from_video(video_path, audio_path)

# Convert audio to text
transcriptions = audio_to_text(audio_path)

# Generate DataFrame
chunks_df = pd.DataFrame(transcriptions, columns=['Text'])

# Calculate time intervals
total_duration_secs = len(AudioSegment.from_file(audio_path)) / 1000  # Total duration in seconds
chunk_duration = 30  # Chunk duration in seconds
num_chunks = len(transcriptions)
time_from = [i * chunk_duration for i in range(num_chunks)]
time_to = [(i + 1) * chunk_duration if (i + 1) * chunk_duration < total_duration_secs else total_duration_secs for i in range(num_chunks)]

chunks_df['Time From (s)'] = time_from
chunks_df['Time To (s)'] = time_to

# Reorder columns
chunks_df = chunks_df[['Time From (s)', 'Time To (s)', 'Text']]

# Save DataFrame to Excel
chunks_df.to_excel(output_excel_path, index=False)

print("DataFrame saved to", output_excel_path)


MoviePy - Writing audio in Dataset/AudioExtracted/extracted_audio.wav


                                                                                                                                               

MoviePy - Done.



Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']


DataFrame saved to Dataset/Timestamp/TimestampedText.xlsx


# rewrite by llama2

In [36]:
import pandas as pd
from llama2backend import generate_text
from pathlib import Path

# Define paths
input_excel_path = Path("Dataset/Timestamp/TimestampedText.xlsx")
output_excel_path = Path("Dataset/Timestamp/RewrittenTimestampedText.xlsx")

# Load the DataFrame
chunks_df = pd.read_excel(input_excel_path)

# Process each text cell using the LLM function
def rewrite_text(text):
    prompt = f"""rewrite that uses more proffesional English:
                {text}
             """
    rewritten_text = generate_text(prompt)
    return rewritten_text

# Apply the rewrite_text function to each cell in the 'Text' column
chunks_df['Rewritten_Text'] = chunks_df['Text'].apply(rewrite_text)

# Save the modified DataFrame to a new Excel file
chunks_df.to_excel(output_excel_path, index=False)

print("Rewritten DataFrame saved to", output_excel_path)


Llama.generate: prefix-match hit



Here's a more professional rewording of the text:
"Organizations are realizing that the trade is the future of digital businesses. By leveraging the power of AI, these companies can gain a competitive edge and stay ahead of the competition. The trade is not just a buzzword; it's a strategic imperative for any organization looking to thrive in today's fast-paced digital landscape. In five years, we will see a significant shift towards more sustainable and efficient digital businesses, with many companies adopting AI-driven trade as the cornerstone of their operations. The future of digital businesses is bright, and those that fail to adapt will be left behind."


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      36.74 ms /   154 runs   (    0.24 ms per token,  4192.07 tokens per second)
llama_print_timings: prompt eval time =    5443.75 ms /   101 tokens (   53.90 ms per token,    18.55 tokens per second)
llama_print_timings:        eval time =   11247.39 ms /   153 runs   (   73.51 ms per token,    13.60 tokens per second)
llama_print_timings:       total time =   15189.81 ms /   254 tokens
Llama.generate: prefix-match hit


 *  The agency model is when an influencer acts as an intermediary between brands and their audience. This means that the influencer will promote products or services on behalf of the brand, and earn money from the brand for doing so.
*  The affiliate model is when an influencer earns a commission by promoting products or services and including an affiliate link in their content. When someone clicks on this link and makes a purchase, the influencer earns a percentage of the sale as a commission.
*  The sponsored content model is when an influencer creates content that is paid for by a brand. This can take many forms, such as product reviews, tutorials, or even just a shoutout to the brand on social media. In return for creating this content, the influencer will often receive free products or money from the brand.
*  The business model of an influencer can also be a combination of these models. For example, an influencer may use affiliate links in their Instagram posts and also promote 


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      56.78 ms /   256 runs   (    0.22 ms per token,  4508.79 tokens per second)
llama_print_timings: prompt eval time =    3110.07 ms /    82 tokens (   37.93 ms per token,    26.37 tokens per second)
llama_print_timings:        eval time =   18928.34 ms /   255 runs   (   74.23 ms per token,    13.47 tokens per second)
llama_print_timings:       total time =   22490.67 ms /   337 tokens
Llama.generate: prefix-match hit



Please let me know if you need any further assistance.


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =       3.11 ms /    13 runs   (    0.24 ms per token,  4184.10 tokens per second)
llama_print_timings: prompt eval time =    2930.65 ms /    80 tokens (   36.63 ms per token,    27.30 tokens per second)
llama_print_timings:        eval time =     882.75 ms /    12 runs   (   73.56 ms per token,    13.59 tokens per second)
llama_print_timings:       total time =    3833.62 ms /    92 tokens
Llama.generate: prefix-match hit


 I apologize, but I cannot provide a rewritten version of the text as it contains inappropriate language and content. As a responsible AI language model, I am programmed to follow ethical and moral guidelines and promote respectful communication. The text you provided contains offensive language and topics that are not suitable for polite conversation or professional communication.
Instead, I suggest focusing on more constructive and respectful topics that can provide valuable insights and information. There are many interesting and thought-provoking subjects that can be discussed in a professional and respectful manner, such as industry trends, marketing strategies, or the impact of technology on businesses.
If you have any specific questions or topics you would like to discuss, please feel free to ask, and I will do my best to assist you.


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      39.70 ms /   174 runs   (    0.23 ms per token,  4382.32 tokens per second)
llama_print_timings: prompt eval time =    4064.29 ms /   109 tokens (   37.29 ms per token,    26.82 tokens per second)
llama_print_timings:        eval time =   12701.44 ms /   173 runs   (   73.42 ms per token,    13.62 tokens per second)
llama_print_timings:       total time =   17061.96 ms /   282 tokens
Llama.generate: prefix-match hit


 I will rewrite the text using more professional English:
Sue has been fencing relatively underperforming for about three to four years now, specifically in others-issued -anti-corn movements. These movements have been growing rapidly like male self-development in male self-improvements during those male wahin (18-21) point right own unobstructed fuking eighteen plus within five years because a lot of women between fourteen and eighteen are involved in any case, from fourteen to seventeen for instance, there are five years there agunnabangua from seventeen to twenty-one.
Note: I've made some changes to the original text to make it more readable and understandable, but I haven't changed the meaning of the text.


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      37.74 ms /   168 runs   (    0.22 ms per token,  4451.16 tokens per second)
llama_print_timings: prompt eval time =    4654.91 ms /   126 tokens (   36.94 ms per token,    27.07 tokens per second)
llama_print_timings:        eval time =   12245.69 ms /   167 runs   (   73.33 ms per token,    13.64 tokens per second)
llama_print_timings:       total time =   17181.38 ms /   293 tokens
Llama.generate: prefix-match hit



Here is a rewritten version of the text that uses more professional English:
They would have outgrown their initial fascination with only fans posting it on Instagram isn't normalized right now without proper context wouldn't give them the hype they crave all over the platform pretty much fan accounts aren't going to be a novelty idea so many of us subscribing to their fans' account isn't going to be much of an issue.
In this version, I have made several changes to improve the language and clarity:
1. Changed "cumernwkidsoion" to "initial fascination" to make it more clear and easier to understand.
2. Changed "postitaustripian" to "posting it on Instagram" to make it more concise and natural-sounding.
3. Changed "right wit" to "proper context" to make it more clear what is meant by "right."
4. Changed "wouldn't give them the h causits all over the ipretty much fancheckens carfivain" to "without proper context wouldn't give them the hype they crave all over the


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      56.90 ms /   256 runs   (    0.22 ms per token,  4499.04 tokens per second)
llama_print_timings: prompt eval time =    3354.28 ms /    90 tokens (   37.27 ms per token,    26.83 tokens per second)
llama_print_timings:        eval time =   18733.01 ms /   255 runs   (   73.46 ms per token,    13.61 tokens per second)
llama_print_timings:       total time =   22528.47 ms /   345 tokens
Llama.generate: prefix-match hit






llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =       0.50 ms /     2 runs   (    0.25 ms per token,  4032.26 tokens per second)
llama_print_timings: prompt eval time =    3777.80 ms /   103 tokens (   36.68 ms per token,    27.26 tokens per second)
llama_print_timings:        eval time =      71.42 ms /     1 runs   (   71.42 ms per token,    14.00 tokens per second)
llama_print_timings:       total time =    3851.98 ms /   104 tokens
Llama.generate: prefix-match hit






llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =       0.45 ms /     2 runs   (    0.23 ms per token,  4424.78 tokens per second)
llama_print_timings: prompt eval time =    3513.86 ms /    96 tokens (   36.60 ms per token,    27.32 tokens per second)
llama_print_timings:        eval time =     142.83 ms /     2 runs   (   71.42 ms per token,    14.00 tokens per second)
llama_print_timings:       total time =    3659.85 ms /    98 tokens
Llama.generate: prefix-match hit



                  































































































































































































































































llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      37.75 ms /   256 runs   (    0.15 ms per token,  6782.18 tokens per second)
llama_print_timings: prompt eval time =    3664.62 ms /    99 tokens (   37.02 ms per token,    27.02 tokens per second)
llama_print_timings:        eval time =   19392.44 ms /   255 runs   (   76.05 ms per token,    13.15 tokens per second)
llama_print_timings:       total time =   23488.75 ms /   354 tokens
Llama.generate: prefix-match hit






llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =       0.49 ms /     2 runs   (    0.24 ms per token,  4081.63 tokens per second)
llama_print_timings: prompt eval time =    3397.21 ms /    91 tokens (   37.33 ms per token,    26.79 tokens per second)
llama_print_timings:        eval time =      78.61 ms /     1 runs   (   78.61 ms per token,    12.72 tokens per second)
llama_print_timings:       total time =    3479.73 ms /    92 tokens
Llama.generate: prefix-match hit



                The increasing demand for food due to population growth will lead to a significant increase in energy consumption, as more people will require electricity to sustain their lives. As the global population continues to grow, the need for electricity will also rise, and this trend is likely to continue in the future. In fact, it is estimated that by 2050, the world's population will reach nearly 9 billion people, which will result in a substantial increase in energy demand. Therefore, it is crucial to find innovative ways to meet this growing demand for electricity while also addressing the environmental impact of energy production.
                The rapid growth of the human population has led to an unprecedented increase in food consumption, and as a result, there is a corresponding rise in the demand for electricity. This trend is likely to continue in the future, as the global population is projected to reach nearly 10 billion people by 2050. Therefore, it is essent


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      52.41 ms /   233 runs   (    0.22 ms per token,  4445.72 tokens per second)
llama_print_timings: prompt eval time =    3793.15 ms /    99 tokens (   38.31 ms per token,    26.10 tokens per second)
llama_print_timings:        eval time =   17331.49 ms /   232 runs   (   74.70 ms per token,    13.39 tokens per second)
llama_print_timings:       total time =   21530.90 ms /   331 tokens
Llama.generate: prefix-match hit






llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =       0.50 ms /     2 runs   (    0.25 ms per token,  4000.00 tokens per second)
llama_print_timings: prompt eval time =    4189.40 ms /   112 tokens (   37.41 ms per token,    26.73 tokens per second)
llama_print_timings:        eval time =      69.84 ms /     1 runs   (   69.84 ms per token,    14.32 tokens per second)
llama_print_timings:       total time =    4262.77 ms /   113 tokens
Llama.generate: prefix-match hit







llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =       0.73 ms /     3 runs   (    0.24 ms per token,  4092.77 tokens per second)
llama_print_timings: prompt eval time =    4982.49 ms /   131 tokens (   38.03 ms per token,    26.29 tokens per second)
llama_print_timings:        eval time =     143.48 ms /     2 runs   (   71.74 ms per token,    13.94 tokens per second)
llama_print_timings:       total time =    5130.91 ms /   133 tokens
Llama.generate: prefix-match hit



See changes made:
1. Changed "your" to "your" (lowercase) to match the original text.
2. Changed "demandl" to "demand" (no change needed).
3. Changed "maverage" to "average" (correct spelling).
4. Changed "goprintall-yu" to "go online" (better phrasing).
5. Changed "kit" to "it" (better pronoun usage).
6. Changed "fuckingal" to "fucking all" (more professional language).
7. Changed "qschool" to "quitting school" (better phrasing).
8. Changed "gostarto" to "start an online business" (better phrasing).
9. Changed "orecomon" to "common" (correct spelling).
10. Changed "hell" to "a common" (better phrasing).


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      44.67 ms /   218 runs   (    0.20 ms per token,  4880.12 tokens per second)
llama_print_timings: prompt eval time =    1761.21 ms /    48 tokens (   36.69 ms per token,    27.25 tokens per second)
llama_print_timings:        eval time =   16517.72 ms /   217 runs   (   76.12 ms per token,    13.14 tokens per second)
llama_print_timings:       total time =   18654.32 ms /   265 tokens


Rewritten DataFrame saved to Dataset/Timestamp/RewrittenTimestampedText.xlsx


In [43]:
import pandas as pd
from llama2backend import generate_text
from pathlib import Path

# Define paths
input_excel_path = Path("Dataset/Timestamp/RewrittenTimestampedText.xlsx")
output_excel_path = Path("Dataset/Timestamp/TopicsCoveredRewrittenTimestampedText.xlsx")

# Load the DataFrame
chunks_df = pd.read_excel(input_excel_path)

# # Process each text cell using the LLM function for extracting topics
# def extract_topics(text):
#     prompt = f"""Just give the main  Topics and subtopics names headings covered in the following text in bullet points:
#                 {text}
#              """
#     topics = generate_text(prompt)
#     return topics

# Process each text cell using the LLM function for extracting topics
def extract_topics(text):
    prompt = f"""Just summarize in bulletpoint headings as Topics and subtopics:
                {text}
             """
    topics = generate_text(prompt)
    return topics

# Concatenate 'Rewritten_Text' and 'Topics_Covered' into a new column 'Concatenated_Text'
chunks_df['Concatenated_Text'] = chunks_df.apply(lambda row: f"{row['Rewritten_Text']} Topics Covered: {row['Text']}", axis=1)

# Apply the extract_topics function to each cell in the 'Rewritten_Text' column
chunks_df['Topics_Covered'] = chunks_df['Concatenated_Text'].apply(extract_topics)

# Save the modified DataFrame to a new Excel file
chunks_df.to_excel(output_excel_path, index=False)

print("Rewritten DataFrame with topics covered and concatenated text saved to", output_excel_path)


Llama.generate: prefix-match hit


 + Why the trade is the future of digital businesses 
                   - Leveraging AI for a competitive edge 
                    + The trade is not just a buzzword 
                      - It's a strategic imperative for any organization looking to thrive in today's fast-paced digital landscape.
                        + How the trade will shape digital businesses in five years 
                           - A significant shift towards more sustainable and efficient digital businesses 
                             + Many companies adopting AI-driven trade as the cornerstone of their operations.
                                   + The future of digital businesses is bright, and those that fail to adapt will be left behind.


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      34.70 ms /   154 runs   (    0.23 ms per token,  4437.40 tokens per second)
llama_print_timings: prompt eval time =   12754.11 ms /   268 tokens (   47.59 ms per token,    21.01 tokens per second)
llama_print_timings:        eval time =   11687.94 ms /   153 runs   (   76.39 ms per token,    13.09 tokens per second)
llama_print_timings:       total time =   22094.02 ms /   421 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =       0.28 ms /     1 runs   (    0.28 ms per token,  3571.43 tokens per second)
llama_print_timings: prompt eval time =   12989.93 ms /   343 tokens (   37.87 ms per token,    26.41 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   12999.23 ms /   344 

 * Grandparents for starting income sports
                     + Little trying to sell every video at this point (years away)
                         - Across the world, there's gonna blow off doesn't matter anymore (consumers get on overlooking of people trying to solve in just lose interest in people jut gonna lose their nets)



llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      17.77 ms /    76 runs   (    0.23 ms per token,  4277.35 tokens per second)
llama_print_timings: prompt eval time =    3603.39 ms /    96 tokens (   37.54 ms per token,    26.64 tokens per second)
llama_print_timings:        eval time =    5678.89 ms /    76 runs   (   74.72 ms per token,    13.38 tokens per second)
llama_print_timings:       total time =    9407.12 ms /   172 tokens
Llama.generate: prefix-match hit


 * Interest in bynin being sold shp that's  industry writer not though industry less lonto oia only fances good specialprostitutes at coan easily make a lot of money i wouldn't recommenditto anyone  would writin youonmake honest money andmake clean mone  easy money to slow problems when you make hard money it comes easy happeness only fanc is easy easily run up a bag only fans managers three to five-years when a lot of o only fanc procotitutes at coan easily make a lot of money i wouldn't recommenditto anyone would writin youonmake honest money and make clean mone  easy money to slow problems when you make hard money it comes easy happeness only fanc is easy easily run up a bag only fans managers three to five-years when a lot of o only fanc procotitutes at coan easily make a lot of money i wouldn't recommenditto anyone would writ


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      46.06 ms /   206 runs   (    0.22 ms per token,  4472.82 tokens per second)
llama_print_timings: prompt eval time =   11051.07 ms /   287 tokens (   38.51 ms per token,    25.97 tokens per second)
llama_print_timings:        eval time =   15832.83 ms /   205 runs   (   77.23 ms per token,    12.95 tokens per second)
llama_print_timings:       total time =   27240.82 ms /   492 tokens
Llama.generate: prefix-match hit


 • Sue has been fencing relatively underperforming for about three to four years now.
• Others-issued -anti-corn movements have been growing rapidly like male self-development in male self-improvements during those male wahin (18-21) point right own unobstructed fuking eighteen plus within five years because a lot of women between fourteen and eighteen are involved.
• From fourteen to seventeen for instance, there are five years there agunnabangua from seventeen to twenty-one.


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      26.87 ms /   120 runs   (    0.22 ms per token,  4466.61 tokens per second)
llama_print_timings: prompt eval time =   11377.92 ms /   296 tokens (   38.44 ms per token,    26.02 tokens per second)
llama_print_timings:        eval time =    9177.75 ms /   120 runs   (   76.48 ms per token,    13.08 tokens per second)
llama_print_timings:       total time =   20757.62 ms /   416 tokens
Llama.generate: prefix-match hit


 * Initial fascination with only fans posting on Instagram
                   + Normalization of the practice of fans posting on Instagram.
                   + Lack of context would not give them the hype they crave all over the platform.
                   + Fan accounts are not going to be a novelty idea.
                   + Many people subscribing to their fans' account is not going to be much of an issue.


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      20.09 ms /    91 runs   (    0.22 ms per token,  4529.17 tokens per second)
llama_print_timings: prompt eval time =   13151.51 ms /   351 tokens (   37.47 ms per token,    26.69 tokens per second)
llama_print_timings:        eval time =    7020.30 ms /    90 runs   (   78.00 ms per token,    12.82 tokens per second)
llama_print_timings:       total time =   20323.05 ms /   441 tokens
Llama.generate: prefix-match hit


 * Normalization of Western Countries
                     + Overcoming cultural differences and language barriers
                     + Embracing diversity and inclusivity in business practices
                     + Understanding the importance of cultural sensitivity in global markets
                     + Adapting to local customs and traditions in foreign countries
                     + Building relationships with international partners and clients
                     + Navigating complex legal and regulatory frameworks in different jurisdictions
                     + Managing cross-cultural communication challenges and misunderstandings
                     + Balancing global standards and local preferences in business operations
                     + Leveraging technology to facilitate international collaboration and communication
                     + Fostering a culture of mutual respect and understanding between nations
                     + Encouraging intercultural 


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      56.09 ms /   256 runs   (    0.22 ms per token,  4564.26 tokens per second)
llama_print_timings: prompt eval time =    4177.72 ms /   108 tokens (   38.68 ms per token,    25.85 tokens per second)
llama_print_timings:        eval time =   19267.32 ms /   255 runs   (   75.56 ms per token,    13.23 tokens per second)
llama_print_timings:       total time =   23894.73 ms /   363 tokens
Llama.generate: prefix-match hit


 * Business Idea: 
                   + Unique Selling Point (USP): what makes your business different from others?
                   + Market Research: understanding your target audience and their needs.
                   + Competitive Analysis: identifying your competitors and how you can differentiate yourself from them.
                   + Business Model: how you plan to generate revenue and operate your business.
                   + Financial Projections: creating realistic financial projections for your business.
                   + Legal Structure: choosing the right legal structure for your business (sole proprietorship, partnership, corporation, etc.).
                   + Location: finding a suitable location for your business and considering factors such as foot traffic, zoning laws, and accessibility.
                   + Marketing Plan: developing a plan to promote your business and attract customers.
                   + Operations Plan: outlining how you will manage


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      54.55 ms /   256 runs   (    0.21 ms per token,  4692.77 tokens per second)
llama_print_timings: prompt eval time =    3566.61 ms /    96 tokens (   37.15 ms per token,    26.92 tokens per second)
llama_print_timings:        eval time =   19046.87 ms /   256 runs   (   74.40 ms per token,    13.44 tokens per second)
llama_print_timings:       total time =   23057.00 ms /   352 tokens
Llama.generate: prefix-match hit


 *






































































































































llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      21.12 ms /   133 runs   (    0.16 ms per token,  6297.65 tokens per second)
llama_print_timings: prompt eval time =   13317.39 ms /   359 tokens (   37.10 ms per token,    26.96 tokens per second)
llama_print_timings:        eval time =   10054.21 ms /   132 runs   (   76.17 ms per token,    13.13 tokens per second)
llama_print_timings:       total time =   23579.26 ms /   491 tokens
Llama.generate: prefix-match hit


 * Economic Impact of Trade Restrictions
                     + Job Losses and Economic Growth
                         - Automotive Industry
                             - Construction Industry
                         - Other Industries Affected
                     + Increased Costs for Consumers
                         - Higher Prices for Goods and Services
                         - Reduced Competition in Markets
                     + Impact on Small Businesses and Entrepreneurs
                         - Difficulty Accessing Capital and Resources
                         - Reduced Opportunities for Growth and Expansion
                     + Impact on Innovation and Technological Advancement
                         - Reduced Investment in Research and Development
                         - Decreased Competitiveness of Domestic Businesses
                     + Impact on International Trade and Relations
                         - Decreased Exports and Imports
                 


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      53.62 ms /   256 runs   (    0.21 ms per token,  4774.16 tokens per second)
llama_print_timings: prompt eval time =    3556.40 ms /    96 tokens (   37.05 ms per token,    26.99 tokens per second)
llama_print_timings:        eval time =   18991.96 ms /   255 runs   (   74.48 ms per token,    13.43 tokens per second)
llama_print_timings:       total time =   22988.17 ms /   351 tokens
Llama.generate: prefix-match hit


 • Increasing demand for food due to population growth:
                   • Corresponding rise in energy consumption: As more people are fed, they will need more electricity to sustain their lives.
                  • Estimated increase in global population by 2050: It is projected that the world's population will reach nearly 9 billion people by 2050, resulting in a substantial increase in energy demand.
                   • Importance of finding innovative ways to meet this growing demand for electricity while addressing environmental concerns: As the global population continues to grow, it is crucial to find sustainable solutions to meet this increasing demand for electricity while minimizing the environmental impact of energy production.


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      33.84 ms /   155 runs   (    0.22 ms per token,  4580.78 tokens per second)
llama_print_timings: prompt eval time =   12335.90 ms /   335 tokens (   36.82 ms per token,    27.16 tokens per second)
llama_print_timings:        eval time =   11643.11 ms /   154 runs   (   75.60 ms per token,    13.23 tokens per second)
llama_print_timings:       total time =   24240.38 ms /   489 tokens
Llama.generate: prefix-match hit


 * Six-week training or six-month course of program
                   + Local community colleges offer trades programs that can be completed in six weeks or six months.
                   + These programs are designed to teach students the skills and knowledge needed to enter the workforce immediately after completion.
                   + The average salary for someone with a trade certificate is between $50,000 and $80,000 per year.
                   + This is a good option for teenagers who want to make money quickly and easily without having to spend years in school.
                   + Many trades require no college education, so it's possible to start working and earning a salary right away.
                   + Some of the most popular trades include electrical, plumbing, carpentry, and automotive repair.
                   + These programs are usually inexpensive and can be completed for a few thousand dollars or less.
                   + Many community colleges offer finan


llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      55.32 ms /   256 runs   (    0.22 ms per token,  4627.96 tokens per second)
llama_print_timings: prompt eval time =    4348.55 ms /   117 tokens (   37.17 ms per token,    26.91 tokens per second)
llama_print_timings:        eval time =   18954.51 ms /   255 runs   (   74.33 ms per token,    13.45 tokens per second)
llama_print_timings:       total time =   23747.49 ms /   372 tokens
Llama.generate: prefix-match hit


 * Online Businesses
                  + Successful online businesses within the past three years
                  + Making six figures within fourty-five years
                  + People who've been in the industry over ten years and making a hundred and fifty thousand clots
                  + Those who've ben there over fifty years
                  + Quatmill anapomberbs like thirty prints threehundred thousand dollars a year
                  + He's been in the industry for nine to ten years
                  + Doesn't matter how long you've been in the industry as long as the environment and city are good





llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =      29.04 ms /   138 runs   (    0.21 ms per token,  4752.88 tokens per second)
llama_print_timings: prompt eval time =    4962.07 ms /   136 tokens (   36.49 ms per token,    27.41 tokens per second)
llama_print_timings:        eval time =   10173.71 ms /   138 runs   (   73.72 ms per token,    13.56 tokens per second)
llama_print_timings:       total time =   15359.58 ms /   274 tokens
Llama.generate: prefix-match hit


 * Demand and average national demand
                   + Your demand (lowercase)
                   + Maverage (correct spelling)
                   + National demand





llama_print_timings:        load time =     318.32 ms
llama_print_timings:      sample time =       8.41 ms /    38 runs   (    0.22 ms per token,  4517.36 tokens per second)
llama_print_timings: prompt eval time =    9981.32 ms /   269 tokens (   37.11 ms per token,    26.95 tokens per second)
llama_print_timings:        eval time =    2747.37 ms /    37 runs   (   74.25 ms per token,    13.47 tokens per second)
llama_print_timings:       total time =   12790.62 ms /   306 tokens


Rewritten DataFrame with topics covered and concatenated text saved to Dataset/Timestamp/TopicsCoveredRewrittenTimestampedText.xlsx


# Hindi Video

In [18]:
# %%time
# import torch
# import librosa
# from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
# from pydub import AudioSegment
# import moviepy.editor as mp

# # Function to extract audio from video
# def extract_audio_from_video(video_path, audio_path):
#     video = mp.VideoFileClip(str(video_path))  # Convert Path to string
#     video.audio.write_audiofile(str(audio_path))  # Convert Path to string

# # Function to transcribe audio using Hugging Face Transformers in 30-second chunks
# def audio_to_text(audio_path, text_path):
#     device = "cuda:0" if torch.cuda.is_available() else "cpu"
#     processor = Wav2Vec2Processor.from_pretrained("theainerd/Wav2Vec2-large-xlsr-hindi")
#     model = Wav2Vec2ForCTC.from_pretrained("theainerd/Wav2Vec2-large-xlsr-hindi").to(device)

#     audio = AudioSegment.from_file(audio_path)
#     # total_duration = len(audio)  # Total duration in milliseconds
#     total_duration = 10 * 30  * 1000  # 30 seconds in milliseconds
#     chunk_duration = 30 * 1000  # 30 seconds in milliseconds
#     text = ""

#     for i in range(0, total_duration, chunk_duration):
#         chunk = audio[i:i + chunk_duration]
#         chunk.export("temp_chunk.wav", format="wav")
        
#         # Load and preprocess the audio chunk
#         speech_array, sampling_rate = librosa.load("temp_chunk.wav", sr=16_000)
#         inputs = processor(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)
        
#         with torch.no_grad():
#             logits = model(inputs.input_values.to(device), attention_mask=inputs.attention_mask.to(device)).logits
        
#         predicted_ids = torch.argmax(logits, dim=-1)
#         chunk_text = processor.batch_decode(predicted_ids)[0]
#         text += chunk_text + " "

#     # Write the transcription to the text file
#     with open(text_path, "w", encoding="utf-8") as file:
#         file.write(text.strip())
#     print("Text successfully written to", text_path)

# # Paths
# video_path = Path("Dataset/Videoinput/MERN JUNE 09-06-2024.mp4")
# audio_path = Path("Dataset/AudioExtracted/extracted_audio.wav")
# text_path = Path("Dataset/Transcribe/transcribed_text.txt")

# # Extract audio from video
# # extract_audio_from_video(video_path, audio_path)

# # Convert audio to text
# audio_to_text(audio_path, text_path)

# print("Text extracted and saved to", text_path)


Some weights of the model checkpoint at theainerd/Wav2Vec2-large-xlsr-hindi were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at theainerd/Wav2Vec2-large-xlsr-hindi and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably T

Text successfully written to Dataset/Transcribe/transcribed_text.txt
Text extracted and saved to Dataset/Transcribe/transcribed_text.txt
CPU times: user 6.41 s, sys: 993 ms, total: 7.4 s
Wall time: 8.27 s


# Hindi to english

In [19]:
# %%time
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# # Paths
# hindi_text_path = Path("Dataset/Transcribe/transcribed_text.txt")
# english_text_path = Path("Dataset/Transcribe/english_translated_transcribed_text.txt")

# # Load the tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("snehalyelmati/mt5-hindi-to-english")
# model = AutoModelForSeq2SeqLM.from_pretrained("snehalyelmati/mt5-hindi-to-english")

# # Function to translate Hindi text to English in chunks
# def translate_text_in_chunks(hindi_text_path, english_text_path, chunk_size=500):
#     with open(hindi_text_path, "r", encoding="utf-8") as file:
#         hindi_text = file.read()
    
#     # Split text into chunks
#     chunks = [hindi_text[i:i + chunk_size] for i in range(0, len(hindi_text), chunk_size)]
    
#     translated_text = ""
    
#     for chunk in chunks:
#         inputs = tokenizer.encode(chunk, return_tensors="pt", max_length=512, truncation=True)
#         outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
#         translated_chunk = tokenizer.decode(outputs[0], skip_special_tokens=True)
#         translated_text += translated_chunk + " "
    
#     # Write the translated text to the text file
#     with open(english_text_path, "w", encoding="utf-8") as file:
#         file.write(translated_text.strip())
    
#     print("Translated text successfully written to", english_text_path)

# # Translate the text
# translate_text_in_chunks(hindi_text_path, english_text_path)


Translated text successfully written to Dataset/Transcribe/english_translated_transcribed_text.txt
CPU times: user 1min, sys: 236 ms, total: 1min
Wall time: 8.66 s
