In [1]:
import requests
import json
import zipfile
import io
import pysrt
import re
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
from tqdm import tqdm
import os
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Project root = parent of the directory the notebook kernel was started in.
# All data paths in later cells are joined onto prefix_path.
current_dir = os.getcwd()
parent_dir, _ = os.path.split(current_dir)
prefix_path = parent_dir

In [3]:
def get_imdb_subtitles(imdb_id, timeout=30):
    """
    Sends a GET request to the Wizdom API to fetch subtitle data for a given IMDb ID.

    Args:
        imdb_id (str): The IMDb ID of the movie or TV series.
        timeout (float | tuple): Seconds to wait for the server before giving up.
            Passed straight to ``requests.get``; without it a stalled connection
            would block the notebook indefinitely.

    Returns:
        list: The decoded JSON response (a list of subtitle records), or an empty
        list if the request failed or the body was not valid JSON.
    """
    base_url = "https://wizdom.xyz/api/search"
    params = {
        "action": "by_id",
        "imdb": imdb_id
    }
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    # ValueError covers a non-JSON body on requests versions whose JSON error
    # does not derive from RequestException.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Failed to fetch data for IMDb ID {imdb_id}: {e}")
        return []

In [4]:
def get_subtitle_file(sub_id, timeout=30):
    """
    Sends a GET request to the Wizdom API to fetch the subtitle file for a given subID.

    Args:
        sub_id (int): The subtitle ID to fetch.
        timeout (float | tuple): Seconds to wait for the server before giving up.
            Passed straight to ``requests.get`` so a stalled download cannot hang
            the notebook.

    Returns:
        Response | None: The response object from the GET request, or None if the
        request failed (connection error, timeout, or 4xx/5xx status).
    """
    base_url = "https://wizdom.xyz/api/files/sub/"
    url = f"{base_url}{sub_id}"

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        return response
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

In [5]:
# Look up every subtitle record registered for one series and preview the first five.
imdb_id = "tt0086827"
response_json_sub_ids = get_imdb_subtitles(imdb_id)

if not response_json_sub_ids:
    print("Failed to fetch subtitles data.")
else:
    print("Subtitles data fetched successfully!")
    # ensure_ascii=False keeps non-ASCII release names readable in the dump.
    print(
        json.dumps(
            response_json_sub_ids[:5],
            indent=4,
            ensure_ascii=False,
        )
    )


Subtitles data fetched successfully!
[
    {
        "id": 275841,
        "versioname": "Boss.S02E05.720p.HDTV.x264-EVOLVE",
        "score": 0
    },
    {
        "id": 275842,
        "versioname": "Boss.S02E07.720p.HDTV.x264-EVOLVE",
        "score": 0
    },
    {
        "id": 337379,
        "versioname": "S03E07.Jonathan.the.Gymnast",
        "score": 0
    },
    {
        "id": 337380,
        "versioname": "S03E12.The.Way.We.Was.WEBDL",
        "score": 0
    },
    {
        "id": 337327,
        "versioname": "Who's.the.Boss.-.01x04.-.Mona.Gets.Pinned.DVDRip.heb",
        "score": 0
    }
]


In [6]:
# Load the IMDb episode table from <project root>/data and display it.
# NOTE(review): the saved output shows a warning pointing at this read_csv call (its text is
# cut off in the export) — if it is a mixed-dtype warning, pass explicit dtypes; confirm.
# NOTE(review): the "Unnamed: 0" column in the output looks like a saved index — verify
# before dropping it or reading with index_col=0.
df_imdb = pd.read_csv(os.path.join(prefix_path, "data", "imdb_episodes_with_season.csv"))
df_imdb

  df_imdb = pd.read_csv(os.path.join(prefix_path, "data", "imdb_episodes_with_season.csv"))


Unnamed: 0,tconst_episode,tconst_season,seasonNumber,episodeNumber,title_episode,runtimeMinutes,title_series,averageRating_season,numVotes_series,averageRating_episode,numVotes_episode,Year_episode
0,tt0031458,tt32857063,,,El huésped del sevillano,86.0,Teatro lírico español,,,6.9,15.0,1970.0
1,tt0041951,tt0041038,1.0,9.0,The Tenderfeet,30.0,The Lone Ranger,7.7,3012.0,7.6,98.0,1949.0
2,tt0042816,tt0989125,1.0,17.0,Othello,143.0,BBC Sunday-Night Theatre,7.0,188.0,7.6,12.0,1950.0
3,tt0042889,tt0989125,,,The Tragedy of King Richard II/II,145.0,BBC Sunday-Night Theatre,7.0,188.0,,,1950.0
4,tt0043426,tt0040051,3.0,42.0,Coriolanus,60.0,Studio One,7.5,344.0,,,1951.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8679288,tt9916846,tt1289683,3.0,18.0,Episode #3.18,,Arka Siradakiler,3.6,632.0,,,2009.0
8679289,tt9916848,tt1289683,3.0,17.0,Episode #3.17,,Arka Siradakiler,3.6,632.0,,,2009.0
8679290,tt9916850,tt1289683,3.0,19.0,Episode #3.19,,Arka Siradakiler,3.6,632.0,,,2010.0
8679291,tt9916852,tt1289683,3.0,20.0,Episode #3.20,,Arka Siradakiler,3.6,632.0,,,2010.0


In [8]:
# Episodes whose tconst_season equals the ID used for the subtitle lookup above
# (the same literal that was assigned to imdb_id).
df_imdb[df_imdb['tconst_season'] == 'tt0086827']

Unnamed: 0,tconst_episode,tconst_season,seasonNumber,episodeNumber,title_episode,runtimeMinutes,title_series,averageRating_season,numVotes_series,averageRating_episode,numVotes_episode,Year_episode
255300,tt0747696,tt0086827,4.0,9.0,A Fishy Tale,24.0,Who's the Boss?,6.6,16062.0,6.8,85.0,1987.0
255301,tt0747697,tt0086827,5.0,5.0,A Jack Story,24.0,Who's the Boss?,6.6,16062.0,7.2,76.0,1988.0
255302,tt0747698,tt0086827,3.0,24.0,A Moving Episode,24.0,Who's the Boss?,6.6,16062.0,7.1,90.0,1987.0
255303,tt0747699,tt0086827,1.0,5.0,A Rash Decision,30.0,Who's the Boss?,6.6,16062.0,7.2,142.0,1984.0
255304,tt0747700,tt0086827,5.0,8.0,A Spirited Christmas,24.0,Who's the Boss?,6.6,16062.0,7.0,80.0,1988.0
...,...,...,...,...,...,...,...,...,...,...,...,...
322640,tt0842301,tt0086827,3.0,15.0,"Tony, the Patchmaker",24.0,Who's the Boss?,6.6,16062.0,6.8,82.0,1987.0
322796,tt0842470,tt0086827,4.0,5.0,New Kid in Town,24.0,Who's the Boss?,6.6,16062.0,7.1,90.0,1987.0
323033,tt0843169,tt0086827,5.0,7.0,Life with Father,24.0,Who's the Boss?,6.6,16062.0,6.5,70.0,1988.0
323034,tt0843170,tt0086827,5.0,10.0,Mrs. Rossini's Uncle,24.0,Who's the Boss?,6.6,16062.0,6.6,76.0,1989.0


In [11]:
# Prerequisite: run process_data.py first and save its result as "subtitles_data.csv" in the data folder; the next cell loads that file.

In [12]:
# Load the subtitle index (series ID, subtitle ID, release name) from <project root>/data.
df_subtitles = pd.read_csv(os.path.join(prefix_path, "data", "subtitles_data.csv"))
# Requests the first 500 rows; the notebook display shortens long tables with "..." rows.
df_subtitles.head(500)

Unnamed: 0,tconst_season,id,versioname
0,tt0060028,187892,Star.Trek-.The.Next.Generation.S03E01.Evolution
1,tt0060028,187893,Star.Trek-.The.Next.Generation.S03E02.The.Ensi...
2,tt0060028,187894,Star.Trek-.The.Next.Generation.S03E03.The.Surv...
3,tt0060028,187895,Star.Trek-.The.Next.Generation.S03E04.Who.Watc...
4,tt0060028,187896,Star.Trek-.The.Next.Generation.S03E05.The.Bonding
...,...,...,...
495,tt0092455,188002,star.trek.tng.s06e25.720p.bluray.x264-geckos
496,tt0092455,189699,Star.Trek.TNG.S06E26.720p.BluRay.x264-GECKOS
497,tt0092455,224682,Star.Trek_.The.Next.Generation.S01E02.WEBRip
498,tt0092455,224683,Star.Trek_.The.Next.Generation.S01E03.WEBRip


In [13]:
# Column dtypes and non-null counts of the raw subtitle index.
df_subtitles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202134 entries, 0 to 202133
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   tconst_season  202134 non-null  object
 1   id             202134 non-null  int64 
 2   versioname     202134 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.6+ MB


In [14]:
def extract_season_episode(versioname):
    """
    Parse season/episode numbers out of a subtitle release name.

    Three notations are tried in order and the first hit wins:
      1. ``S<num>E<num>`` (any case, optionally one dot or whitespace between the parts)
      2. ``<num>x<num>`` with an optional ``+<num>`` second episode (e.g. ``7x21+22``)
      3. a bare ``E<num>``, which is taken to be season 1

    Args:
        versioname (str): The release name to parse.

    Returns:
        list[tuple[int, int]] | None: ``(season, episode)`` pairs — two pairs for the
        ``7x21+22`` form, otherwise one — or None when nothing matched.
    """
    # Notation 1: SxxEyy
    sxe = re.search(r'[sS](\d+)[\.\s]?[eE](\d+)', versioname)
    if sxe is not None:
        return [(int(sxe.group(1)), int(sxe.group(2)))]

    # Notation 2: NxMM, possibly NxMM+KK for a double episode
    crossed = re.search(r'(\d+)x(\d+)(?:\+(\d+))?', versioname)
    if crossed is not None:
        season_txt, first_ep, second_ep = crossed.groups()
        pairs = [(int(season_txt), int(first_ep))]
        if second_ep:
            pairs.append((int(season_txt), int(second_ep)))
        return pairs

    # Notation 3: lone Eyy -> season defaults to 1
    bare = re.search(r'[eE](\d+)', versioname)
    return [(1, int(bare.group(1)))] if bare is not None else None

In [15]:
# Spot-check: the "<season>x<episode>" notation.
extract_season_episode("Seinfeld.7x09")

[(7, 9)]

In [16]:
# Spot-check: the lowercase "sXXeYY" notation inside a longer release name.
extract_season_episode("this.is.us.s07e06.internal.720p.web.x264-bamboozle")

[(7, 6)]

In [17]:
def process_subtitles(df):
    """
    Add ``season`` and ``episode`` columns parsed from each row's ``versioname``.

    A row whose name encodes two episodes is emitted twice, once per episode, so the
    result can be longer than ``df`` and can repeat index labels. Rows that cannot be
    parsed are kept with ``season``/``episode`` set to None.

    Args:
        df (pd.DataFrame): Must contain a ``versioname`` column.

    Returns:
        pd.DataFrame: A new frame built from copies of the input rows.
    """
    expanded = []
    for _, record in df.iterrows():
        # No match (None/empty) collapses to a single placeholder pair.
        pairs = extract_season_episode(record['versioname']) or [(None, None)]
        for season_no, episode_no in pairs:
            clone = record.copy()
            clone['season'] = season_no
            clone['episode'] = episode_no
            expanded.append(clone)

    return pd.DataFrame(expanded)

In [18]:
# First parsing pass over the whole subtitle index (row-by-row, so this is the slow cell).
df_processed_subtitles = process_subtitles(df_subtitles)

In [19]:
# Preview the first 100 parsed rows.
df_processed_subtitles.head(100)

Unnamed: 0,tconst_season,id,versioname,season,episode
0,tt0060028,187892,Star.Trek-.The.Next.Generation.S03E01.Evolution,3.0,1.0
1,tt0060028,187893,Star.Trek-.The.Next.Generation.S03E02.The.Ensi...,3.0,2.0
2,tt0060028,187894,Star.Trek-.The.Next.Generation.S03E03.The.Surv...,3.0,3.0
3,tt0060028,187895,Star.Trek-.The.Next.Generation.S03E04.Who.Watc...,3.0,4.0
4,tt0060028,187896,Star.Trek-.The.Next.Generation.S03E05.The.Bonding,3.0,5.0
...,...,...,...,...,...
95,tt0060028,188109,Star.Trek.TOS.S03E21.720p.BluRay.x264-SiNNERS,3.0,21.0
96,tt0060028,188110,Star.Trek.TOS.S03E22.720p.BluRay.x264-SiNNERS,3.0,22.0
97,tt0060028,188095,Star.Trek.TOS.S03E23.720p.BluRay.x264-SiNNERS,3.0,23.0
98,tt0060028,188096,Star.Trek.TOS.S03E24.720p.BluRay.x264-SiNNERS,3.0,24.0


In [20]:
# Parsed rows for a single series ID.
df_processed_subtitles[df_processed_subtitles['tconst_season'] == 'tt0094574']


Unnamed: 0,tconst_season,id,versioname,season,episode
38572,tt0094574,291918,Unsolved.S01E01.iNTERNAL.1080p.WEB.x264-EDHD,1.0,1.0


In [52]:
# Per-column count of missing values; season/episode are missing where no pattern matched.
df_processed_subtitles.isnull().sum()

tconst_season       0
id                  0
versioname          0
season           2023
episode          2023
dtype: int64

In [53]:
# Rows with a missing value in any column; these are the input to the second parsing pass.
df_null = df_processed_subtitles[df_processed_subtitles.isnull().any(axis=1)]

In [54]:
# Size and dtypes of the unparsed subset.
df_null.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2023 entries, 751 to 201093
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst_season  2023 non-null   object 
 1   id             2023 non-null   int64  
 2   versioname     2023 non-null   object 
 3   season         0 non-null      float64
 4   episode        0 non-null      float64
dtypes: float64(2), int64(1), object(2)
memory usage: 94.8+ KB


In [55]:
def extract_season_episode_null(versioname, tconst_season):
    """
    Second-pass season/episode parser for release names the generic parser missed.

    Rules are tried in order and the first hit wins:
      1. "Season <num> ... Episode <num>" (any case, any non-digit separators)
      2. names starting with "south.park": a compact number such as 1207 -> (12, 7)
      3. series tt0096697: "Episode <num>", season assumed to be 1
      4. series tt0417299: a "3" followed by digits -> season 3, those digits as episode
      5. a fixed list of series: a compact number such as 516 -> (5, 16)
      6. S<num>E<num>
      7. <num>x<num>, optionally "+<num>" for a second episode
      8. a bare E<num>, season assumed to be 1

    A "compact number" is 3 or 4 digits standing as their own token: the last two
    digits are the episode and the leading one or two are the season. Requiring a
    non-alphanumeric neighbour on both sides keeps codec/resolution tags such as
    "x264" or "720p" from being read as season/episode.

    Args:
        versioname (str): The release name to parse.
        tconst_season (str): The series ID; enables the series-specific rules.

    Returns:
        list[tuple[int, int]] | None: ``(season, episode)`` pairs, or None when no
        rule matched.
    """
    compact_number = r'(?<![0-9A-Za-z])(\d{1,2})(\d{2})(?![0-9A-Za-z])'

    # 1. Look for "Season {num} Episode {num}" or "Season.{num}.Episode-{num}"
    match = re.search(r'Season\D*(\d+)\D*Episode\D*(\d+)', versioname, re.IGNORECASE)
    if match:
        return [(int(match.group(1)), int(match.group(2)))]

    # 2. South Park specific patterns: numbers like 1207 (Season 12, Episode 7)
    if versioname.lower().startswith("south.park"):
        match = re.search(compact_number, versioname)
        if match:
            return [(int(match.group(1)), int(match.group(2)))]

    # 3. For specific tconst_season == 'tt0096697': Look for Episode.{num} (assume season 1)
    if tconst_season == 'tt0096697':
        match = re.search(r'Episode\D*(\d+)', versioname, re.IGNORECASE)
        if match:
            return [(1, int(match.group(1)))]

    # 4. For tconst_season == 'tt0417299': numbers starting with 3 indicate season 3
    # NOTE(review): this matches a "3" anywhere in the name, including inside longer
    # numbers; left as-is because the naming scheme for this series is not visible here.
    if tconst_season == 'tt0417299':
        match = re.search(r'3(\d+)', versioname)
        if match:
            return [(3, int(match.group(1)))]

    # 5. For specific tconst_season IDs: Look for patterns like 516 (Season 5, Episode 16)
    if tconst_season in ('tt0182576', 'tt0460649', 'tt0367345', 'tt0455275'):
        match = re.search(compact_number, versioname)
        if match:
            return [(int(match.group(1)), int(match.group(2)))]

    # 6. Standard S{num}E{num} (case insensitive) with or without a dot
    match = re.search(r'[sS](\d+)[\.\s]?[eE](\d+)', versioname)
    if match:
        return [(int(match.group(1)), int(match.group(2)))]

    # 7. Look for patterns like 7x09 or 7x21+22
    match = re.search(r'(\d+)x(\d+)(?:\+(\d+))?', versioname)
    if match:
        season = int(match.group(1))
        episode1 = int(match.group(2))
        episode2 = match.group(3)  # Optional second episode
        if episode2:
            return [(season, episode1), (season, int(episode2))]
        return [(season, episode1)]

    # 8. Look for standalone E{num} (assume season 1)
    match = re.search(r'[eE](\d+)', versioname)
    if match:
        return [(1, int(match.group(1)))]

    # Return None if no patterns matched
    return None


In [56]:
def process_subtitles_null(df):
    """
    Re-parse rows with ``extract_season_episode_null`` and (re)write season/episode.

    Each input row is copied once per ``(season, episode)`` pair returned by the
    parser, so double-episode names produce two rows. Rows the parser cannot handle
    are kept with both fields set to None.

    Args:
        df (pd.DataFrame): Must contain ``versioname`` and ``tconst_season`` columns.

    Returns:
        pd.DataFrame: A new frame built from copies of the input rows.
    """
    rebuilt = []
    for _, entry in df.iterrows():
        found = extract_season_episode_null(entry['versioname'], entry['tconst_season'])
        # Nothing parsed -> keep the row once, with empty season/episode.
        targets = found if found else [(None, None)]
        for season_value, episode_value in targets:
            duplicate = entry.copy()
            duplicate['season'] = season_value
            duplicate['episode'] = episode_value
            rebuilt.append(duplicate)

    return pd.DataFrame(rebuilt)

In [57]:
# Second parsing pass, applied only to the rows the first pass could not parse.
df_null_processed = process_subtitles_null(df_null)

In [58]:
# Missing values remaining after the second pass.
df_null_processed.isnull().sum()

tconst_season       0
id                  0
versioname          0
season           1729
episode          1729
dtype: int64

In [59]:
# Persist the first-pass result (index column omitted).
# NOTE(review): in the cells shown, df_null_processed is never merged back into
# df_processed_subtitles, so rows recovered by the second pass are not in this CSV — confirm intent.
df_processed_subtitles.to_csv(os.path.join(prefix_path, "data", "processed_subtitles.csv"), index=False)

In [60]:
# Final shape and dtypes of the frame that was just saved.
df_processed_subtitles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 202140 entries, 0 to 202133
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst_season  202140 non-null  object 
 1   id             202140 non-null  int64  
 2   versioname     202140 non-null  object 
 3   season         200117 non-null  float64
 4   episode        200117 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 9.3+ MB


In [63]:
# Raw subtitle records for a single series ID.
df_subtitles[df_subtitles['tconst_season'] == 'tt0094525']


Unnamed: 0,tconst_season,id,versioname
1049,tt0094525,235658,Agatha.Christie's.Poirot.S01E01.1080p.Bluray.2...
1050,tt0094525,235659,Agatha.Christie's.Poirot.S01E02.1080p.Bluray.2...
1051,tt0094525,235660,Agatha.Christie's.Poirot.S01E03.1080p.Bluray.2...
1052,tt0094525,235661,Agatha.Christie's.Poirot.S01E04.1080p.Bluray.2...
1053,tt0094525,235662,Agatha.Christie's.Poirot.S01E05.1080p.Bluray.2...
1054,tt0094525,235663,Agatha.Christie's.Poirot.S01E06.1080p.Bluray.2...
1055,tt0094525,235664,Agatha.Christie's.Poirot.S01E07.1080p.Bluray.2...
1056,tt0094525,235665,Agatha.Christie's.Poirot.S01E08.1080p.Bluray.2...
1057,tt0094525,235666,Agatha.Christie's.Poirot.S01E09.1080p.Bluray.2...
1058,tt0094525,235667,Agatha.Christie's.Poirot.S01E10.1080p.Bluray.2...


In [13]:
# sub_id = 275841
# Take the first subtitle record returned for imdb_id and download its archive.
# NOTE(review): get_imdb_subtitles returns [] on failure, in which case [0] raises IndexError.
sub_id = response_json_sub_ids[0]['id']
# response is None when the download failed.
response = get_subtitle_file(sub_id)


In [None]:
# if response:
#     print("Subtitle fetched successfully!")
#     # Save the file, print the response content, or handle it as needed
#     with open(f"subtitle_{sub_id}.zip", "wb") as f:
#         f.write(response.content)
# else:
#     print("Failed to fetch subtitle.")


# !unzip "/sise/home/lielbin/The-Art-of-Analyzing-Big-Data/subtitle_337328.zip" -d "/sise/home/lielbin/The-Art-of-Analyzing-Big-Data/unzipped_subtitle_337328"

# srt_file = "/sise/home/lielbin/The-Art-of-Analyzing-Big-Data/unzipped_subtitle_337328/Who's.the.Boss.-.S01E01.-.Pilot.DVDRip.Heb.srt"

# # Read the file
# subtitles_heb = pysrt.open(srt_file)

# # Print the subtitles
# for subtitle in subtitles_heb[:10]:
#     print(f"Index: {subtitle.index}")
#     print(f"Start: {subtitle.start}")
#     print(f"End: {subtitle.end}")
#     print(f"Text: {subtitle.text}")
#     print("-" * 20)

In [None]:
def get_subtitle_text(sub_id, timeout=30):
    """
    Sends a GET request to the Wizdom API to fetch the subtitle archive for a given
    subID and returns the decoded text of the subtitle file inside it.

    The archive member is chosen as the first ``.srt``/``.txt`` file; if there is
    none, the first non-directory entry is used. Bytes are decoded as UTF-8 (BOM
    stripped), falling back to Windows-1255 (common for Hebrew subtitles) and, as a
    last resort, UTF-8 with replacement characters.

    Args:
        sub_id (int): The subtitle ID to fetch.
        timeout (float | tuple): Seconds to wait for the server before giving up;
            passed to ``requests.get``.

    Returns:
        str | None: The text content of the subtitle file, or None if the download
        failed, the payload was not a zip archive, or the archive had no files.
    """
    base_url = "https://wizdom.xyz/api/files/sub/"
    url = f"{base_url}{sub_id}"

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)

        # Open the response content as a zip file
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            # Directory entries end with "/" and carry no data.
            members = [name for name in z.namelist() if not name.endswith("/")]
            # Prefer actual subtitle/text files over whatever is listed first (.nfo, images, ...).
            preferred = [name for name in members if name.lower().endswith((".srt", ".txt"))]
            candidates = preferred or members
            if candidates:
                return _decode_subtitle_bytes(z.read(candidates[0]))

        print("No suitable file found in the zip archive.")
        return None
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching the subtitle: {e}")
        return None
    except zipfile.BadZipFile:
        print("The downloaded file is not a valid zip file.")
        return None


def _decode_subtitle_bytes(raw):
    """Decode subtitle bytes, trying the likely encodings before falling back to lossy UTF-8."""
    encodings = ["utf-8-sig", "cp1255"]
    # A UTF-16 byte-order mark is unambiguous, so honour it first when present.
    if raw[:2] in (b"\xff\xfe", b"\xfe\xff"):
        encodings.insert(0, "utf-16")
    for encoding in encodings:
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    return raw.decode("utf-8", errors="replace")

In [None]:
def extract_clean_text_from_srt(subtitle_text):
    """
    Extracts clean text from an SRT-formatted subtitle, removing indexes and timestamps.

    Lines are stripped of surrounding whitespace and a leading byte-order mark is
    ignored, so cue indexes and timestamps are recognised even in files with a BOM or
    trailing spaces. A digit-only line is treated as a cue index only when the next
    line is a timestamp; digit-only dialogue (e.g. "1984") is kept.

    Args:
        subtitle_text (str | None): The content of the subtitle in SRT format.

    Returns:
        str: A single string containing the clean text of the subtitle, with the
        remaining lines joined by single spaces. Empty string for empty/None input.
    """
    if not subtitle_text:
        return ""

    # Timestamp line: "HH:MM:SS,mmm --> HH:MM:SS,mmm", tolerating "." as the fraction
    # separator and anything trailing after the end time.
    timestamp_pattern = re.compile(
        r"^\d{1,2}:\d{2}:\d{2}[,.]\d{1,3}\s*-->\s*\d{1,2}:\d{2}:\d{2}[,.]\d{1,3}"
    )

    # Drop a leading BOM (it would stop the first index line from being all digits).
    lines = [line.strip() for line in subtitle_text.lstrip("\ufeff").splitlines()]

    clean_lines = []
    for position, line in enumerate(lines):
        if not line:  # Skip empty lines
            continue
        if timestamp_pattern.match(line):  # Skip timestamp lines
            continue
        # Skip index lines: digits immediately followed by the cue's timestamp line.
        if (
            line.isdigit()
            and position + 1 < len(lines)
            and timestamp_pattern.match(lines[position + 1])
        ):
            continue
        clean_lines.append(line)

    # Join the remaining lines into a single string
    return " ".join(clean_lines)

In [None]:
# Download the text of the subtitle selected above (sub_id); subtitle_text was
# previously never assigned anywhere in the notebook, so this cell failed on a fresh kernel.
subtitle_text = get_subtitle_text(sub_id)
if subtitle_text is None:
    # Fail here with a clear message instead of an attribute error inside the cleaner.
    raise RuntimeError(f"Could not download subtitle text for sub_id {sub_id}")

# Strip SRT cue indexes and timestamps, leaving only the dialogue.
clean_subtitle_text = extract_clean_text_from_srt(subtitle_text)
print(clean_subtitle_text)

In [None]:
# Initialize the tokenizer and model for alephBERT-base
tokenizer = AutoTokenizer.from_pretrained("onlplab/alephbert-base")
model = AutoModel.from_pretrained("onlplab/alephbert-base")

# Text to embed. The only assignment to this name used to be a commented-out sample
# sentence, so the cell raised NameError on a fresh kernel.
# NOTE(review): bound to the cleaned subtitle text produced above — confirm this is the
# intended input (the name suggests the full Hebrew subtitles of one episode).
subtitles_all_episode_heb = clean_subtitle_text

# Tokenize the input. truncation=True keeps only the tokens that fit the model's maximum
# input length, so a long episode is represented by its opening portion only.
inputs = tokenizer(subtitles_all_episode_heb, return_tensors="pt", truncation=True, padding=True)

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

# Mean pooling over the token dimension to create a single sentence vector
sentence_vector_heb_sub = outputs.last_hidden_state.mean(dim=1)

# Print the resulting vector
print("Sentence Vector:", sentence_vector_heb_sub)

In [1]:
from transformers import pipeline
import torch

# This loads the model onto the GPU in bfloat16 precision
# NOTE(review): the execution counter restarts at [1] in this cell, so it was run in a
# separate kernel session from the cells above; `model` also reuses the name given to the
# AlephBERT encoder earlier in the notebook, so a top-to-bottom run overwrites that object.
model = pipeline('text-generation', 'dicta-il/dictalm2.0', torch_dtype=torch.bfloat16, device_map='cuda')

  from .autonotebook import tqdm as notebook_tqdm
2024-12-30 14:13:28.331097: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735560808.351753 3452771 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735560808.358115 3452771 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-30 14:13:28.379739: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]


In [2]:
# Sample few shot examples
# Each pair is a Hebrew verb in past tense ("עבר") followed by its future tense ("עתיד");
# the last pair leaves the future form blank for the model to complete.
prompt = """
עבר: הלכתי
עתיד: אלך

עבר: שמרתי
עתיד: אשמור

עבר: שמעתי
עתיד: אשמע

עבר: עשיתי
עתיד:
"""

# Greedy decoding (do_sample=False), at most 8 new tokens.
# NOTE(review): the recorded output continues past the first newline even though
# stop_sequence='\n' is passed — confirm how stop_sequence is applied.
# NOTE(review): the commented sample output under this call ends with a different example
# verb than this prompt, so it was not produced by this exact prompt.
print(model(prompt.strip(), do_sample=False, max_new_tokens=8, stop_sequence='\n'))
# [{'generated_text': 'עבר: הלכתי\nעתיד: אלך\n\nעבר: שמרתי\nעתיד: אשמור\n\nעבר: שמעתי\nעתיד: אשמע\n\nעבר: הבנתי\nעתיד: אבין\n\n'}]

Setting `pad_token_id` to `eos_token_id`:28705 for open-end generation.


[{'generated_text': 'עבר: הלכתי\nעתיד: אלך\n\nעבר: שמרתי\nעתיד: אשמור\n\nעבר: שמעתי\nעתיד: אשמע\n\nעבר: עשיתי\nעתיד: אעשה\n\nעבר: ה'}]


In [3]:
# Generate the response
# Same call as in the previous cell, but the result is kept in `response` before printing.
# NOTE: `response` was used earlier in the notebook for the HTTP response of the subtitle
# download; this assignment replaces it with the pipeline's list of generations.
response = model(prompt.strip(), do_sample=False, max_new_tokens=8, stop_sequence='\n')
print(response)


Setting `pad_token_id` to `eos_token_id`:28705 for open-end generation.


[{'generated_text': 'עבר: הלכתי\nעתיד: אלך\n\nעבר: שמרתי\nעתיד: אשמור\n\nעבר: שמעתי\nעתיד: אשמע\n\nעבר: עשיתי\nעתיד: אעשה\n\nעבר: ה'}]


In [5]:
# Hebrew summarization prompt: the instruction "Summarize the following text:" followed by
# a short paragraph about how science and technology have changed humanity.
# This overwrites the few-shot `prompt` defined above.
prompt = """
תסכם את הטקסט הבא:
בעשורים האחרונים, המדע והטכנולוגיה שינו את פני האנושות באופן מהותי. בזכות פריצות דרך בתחומי הרפואה, האנרגיה והתחבורה, חיינו הפכו לבטוחים, נוחים ויעילים יותר. עם זאת, ההתקדמות מעוררת גם דאגות אתיות.
"""


In [7]:
# Generate the response
# Greedy decoding with up to 100 new tokens.
# NOTE(review): the recorded output is not a summary — after echoing the prompt the model
# continues with a new question and restates the paragraph until the token limit.
response = model(prompt.strip(), do_sample=False, max_new_tokens=100)  # Adjust max_new_tokens for longer summaries
print(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': 'תסכם את הטקסט הבא:\nבעשורים האחרונים, המדע והטכנולוגיה שינו את פני האנושות באופן מהותי. בזכות פריצות דרך בתחומי הרפואה, האנרגיה והתחבורה, חיינו הפכו לבטוחים, נוחים ויעילים יותר. עם זאת, ההתקדמות מעוררת גם דאגות אתיות.\nהאם אתה מסכים עם הטענה הבאה:\nההתקדמות המדעית והטכנולוגית בעשורים האחרונים שינתה את פני האנושות באופן מהותי. בזכות פריצות דרך בתחומי הרפואה, האנרגיה והתחבורה, חיינו הפכו לבטוחים, נוחים ויעילים יותר. עם זאת, ההתקדמות מעוררת גם ד'}]
