In [5]:
import pandas as pd

airport_df = pd.read_csv("airport.csv")

In [6]:
airport_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17721 entries, 0 to 17720
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   airport_name                 17721 non-null  object 
 1   link                         17721 non-null  object 
 2   title                        17721 non-null  object 
 3   author                       17721 non-null  object 
 4   author_country               12777 non-null  object 
 5   date                         17721 non-null  object 
 6   content                      17721 non-null  object 
 7   experience_airport           647 non-null    object 
 8   date_visit                   593 non-null    object 
 9   type_traveller               646 non-null    object 
 10  overall_rating               13796 non-null  float64
 11  queuing_rating               12813 non-null  float64
 12  terminal_cleanliness_rating  12815 non-null  float64
 13  terminal_seating

In [7]:
lengths = airport_df['content'].astype(str).str.len()

print("Analiza dužine recenzija (broj karaktera):")
print(f"Prosečna dužina: {lengths.mean():.2f}")
print(f"25. percentil: {lengths.quantile(0.25):.0f}")
print(f"50. percentil (medijana): {lengths.quantile(0.50):.0f}")
print(f"75. percentil: {lengths.quantile(0.75):.0f}")
print(f"Minimalna dužina: {lengths.min()}")
print(f"Maksimalna dužina: {lengths.max()}")


Analiza dužine recenzija (broj karaktera):
Prosečna dužina: 645.69
25. percentil: 361
50. percentil (medijana): 550
75. percentil: 814
Minimalna dužina: 52
Maksimalna dužina: 5122


In [8]:
airport_df['airport_name'] = (
    airport_df['airport_name']
        .str.replace('-', ' ', regex=False)
        .str.title()
)


In [9]:
# Grupisanje po airport_name i broj recenzija
airport_counts = airport_df.groupby('airport_name')['content'].count().reset_index()

# Preimenuj kolonu radi preglednosti
airport_counts.rename(columns={'content': 'num_reviews'}, inplace=True)

# Sortiranje po broju recenzija
airport_counts = airport_counts.sort_values(by='num_reviews', ascending=False)
print(airport_counts)


                airport_name  num_reviews
410  London Heathrow Airport          520
411  London Stansted Airport          402
437       Manchester Airport          303
519        Paris Cdg Airport          301
210            Dubai Airport          279
..                       ...          ...
709         Wakkanai Airport            1
719  Westerland Sylt Airport            1
720          Wichita Airport            1
721             Wick Airport            1
1             Aarhus Airport            1

[741 rows x 2 columns]


In [10]:
top_airports = airport_counts.head(20)
print(top_airports)


                     airport_name  num_reviews
410       London Heathrow Airport          520
411       London Stansted Airport          402
437            Manchester Airport          303
519             Paris Cdg Airport          301
210                 Dubai Airport          279
420                 Luton Airport          275
409        London Gatwick Airport          252
60   Bangkok Suvarnabhumi Airport          220
242        Frankfurt Main Airport          218
414       Los Angeles Lax Airport          199
455                 Miami Airport          191
488          New York Jfk Airport          185
624      Singapore Changi Airport          181
392        Leeds Bradford Airport          166
675       Toronto Pearson Airport          166
31     Amsterdam Schiphol Airport          166
304             Hong Kong Airport          162
366     Klia Kuala Lumpur Airport          160
581        Rome Fiumicino Airport          155
116               Bristol Airport          154


In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
import torch

# ===============================
# 1. MODEL ZA PREVOD (EN → SR)
# ===============================

translator_name = "Helsinki-NLP/opus-mt-tc-base-en-sh"
translator_tokenizer = AutoTokenizer.from_pretrained(translator_name)
translator_model = AutoModelForSeq2SeqLM.from_pretrained(translator_name)

def translate_to_serbian(text):
    inputs = translator_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True
    )
    outputs = translator_model.generate(
        **inputs,
        max_new_tokens=200
    )
    return translator_tokenizer.decode(
        outputs[0],
        skip_special_tokens=True
    )

# ===============================
# 2. MODEL ZA SUMARIZACIJU
# ===============================

summ_model_name = "google/flan-t5-large"
summ_tokenizer = AutoTokenizer.from_pretrained(summ_model_name)
summ_model = AutoModelForSeq2SeqLM.from_pretrained(summ_model_name)

# ===============================
# 3. UČITAVANJE I ČIŠĆENJE PODATAKA
# ===============================

def get_all_reviews(df, airport_name):
    return (
        df[df["airport_name"] == airport_name]["content"]
        .dropna()
        .tolist()
    )

def clean_reviews(reviews, min_len=50):
    cleaned = []
    for r in reviews:
        r = r.strip()
        if len(r) < min_len:
            continue
        if r.lower() in ["i flew", "i tried", "flew", "travelled"]:
            continue
        cleaned.append(r)
    return cleaned

# ===============================
# 4. HIJERARHIJSKA SUMARIZACIJA
# ===============================

def chunk_reviews(reviews, chunk_size=5):
    for i in range(0, len(reviews), chunk_size):
        yield reviews[i:i + chunk_size]

def summarize_chunk(reviews_chunk):
    combined_text = " ".join(reviews_chunk)

    prompt = (
        "Summarize the following airport reviews into ONE clear and informative sentence:\n"
        + combined_text
    )

    inputs = summ_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )

    outputs = summ_model.generate(
        **inputs,
        max_new_tokens=60,
        min_new_tokens=15,
        num_beams=5,
        early_stopping=True
    )

    return summ_tokenizer.decode(
        outputs[0],
        skip_special_tokens=True
    )

def summarize_airport_hierarchical(reviews):
    if len(reviews) == 0:
        return "No sufficient reviews available."

    chunk_summaries = []

    for chunk in chunk_reviews(reviews):
        summary = summarize_chunk(chunk)
        chunk_summaries.append(summary)

    combined_summary_text = " ".join(chunk_summaries)

    final_prompt = (
        "Summarize the following airport review summaries into ONE concise and balanced sentence:\n"
        + combined_summary_text
    )

    inputs = summ_tokenizer(
        final_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )

    outputs = summ_model.generate(
        **inputs,
        max_new_tokens=80,
        min_new_tokens=20,
        num_beams=5,
        early_stopping=True
    )

    return summ_tokenizer.decode(
        outputs[0],
        skip_special_tokens=True
    )

# ===============================
# 5. LISTA AERODROMA
# ===============================

top_airports = [
    "London Heathrow Airport",
    "London Stansted Airport",
    "Manchester Airport",
    "Paris Cdg Airport",
    "Dubai Airport",
    "Luton Airport",
    "London Gatwick Airport",
    "Bangkok Suvarnabhumi Airport",
    "Frankfurt Main Airport",
    "Los Angeles Lax Airport",
    "Miami Airport",
    "New York Jfk Airport",
    "Singapore Changi Airport",
    "Leeds Bradford Airport",
    "Toronto Pearson Airport",
    "Amsterdam Schiphol Airport",
    "Hong Kong Airport",
    "Klia Kuala Lumpur Airport",
    "Rome Fiumicino Airport",
    "Bristol Airport"
]

# ===============================
# 6. GLAVNA PETLJA
# ===============================

airport_summaries = []

for airport in top_airports:
    reviews = get_all_reviews(airport_df, airport)
    reviews = clean_reviews(reviews)

    if not reviews:
        print(f"Nema dovoljno recenzija za {airport}")
        continue

    # Sumarizacija (EN)
    summary_en = summarize_airport_hierarchical(reviews)

    # Prevod (SR)
    summary_sr = translate_to_serbian(summary_en)

    airport_summaries.append({
        "airport_name": airport,
        "summary_sentence": summary_sr
    })

    print("-" * 80)
    print(f"Sumarni opis za {airport}: {summary_sr}")

# ===============================
# 7. REZULTATI U DATAFRAME
# ===============================

summary_df = pd.DataFrame(airport_summaries)




--------------------------------------------------------------------------------
Sumarni opis za London Heathrow Airport: Heathrow je jedan od najboljih aerodroma, vrlo prijateljski osoblje, odličan raspon objekata, puno posla dok čekate svoj let i vrlo čisto.To je apsolutno ogroman i može zbuniti turiste ali to je lako dobiti okolo. Terminal 5 može biti zastrašujuća zračna luka, nikada više neću kročiti u Heathrow ako mogu pomoći, i žao mi je.
--------------------------------------------------------------------------------
Sumarni opis za London Stansted Airport: Ovaj aerodrom je nekada bio takva priča o uspjehu prije samo nekoliko godina, sada je noćna mora.
--------------------------------------------------------------------------------
Sumarni opis za Manchester Airport: Nisam ljubitelj ovog aerodroma, datiran je, i ne postoji ogromna količina posla, ali nikad nije trebalo više od 20 minuta da se stigne od kapije do kapije.
----------------------------------------------------------

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import pandas as pd

# inicijalizacija modela za prevod sa engleskog na srpski
translator_name = "Helsinki-NLP/opus-mt-tc-base-en-sh"
translator_tokenizer = AutoTokenizer.from_pretrained(translator_name)
translator_model = AutoModelForSeq2SeqLM.from_pretrained(translator_name)

def translate_to_serbian(text):
    inputs = translator_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = translator_model.generate(**inputs, max_new_tokens=200)
    return translator_tokenizer.decode(outputs[0], skip_special_tokens=True)

# funkcija koja uzima sve recenzije datog aerodorma
def get_all_reviews(df, airport_name):
    return df[df['airport_name'] == airport_name]['content'].dropna().tolist()

# inicijalizacija modela za sažimanje teksta na engleskom
summ_model = "google/flan-t5-large"
summ_tokenizer = AutoTokenizer.from_pretrained(summ_model)
summ_model = AutoModelForSeq2SeqLM.from_pretrained(summ_model)

def summarize_airport(reviews):
    combined_text = " ".join(reviews)
    input_text = f"Summarize the following airport reviews into one concise sentence.: {combined_text}"

    inputs = summ_tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    outputs = summ_model.generate(**inputs, max_new_tokens=80, num_beams=5, early_stopping=True)
    return summ_tokenizer.decode(outputs[0], skip_special_tokens=True)

# lista aerodroma
top_airports = [
    "London Heathrow Airport",
    "London Stansted Airport",
    "Manchester Airport",
    "Paris Cdg Airport",
    "Dubai Airport",
    "Luton Airport",
    "London Gatwick Airport",
    "Bangkok Suvarnabhumi Airport",
    "Frankfurt Main Airport",
    "Los Angeles Lax Airport",
    "Miami Airport",
    "New York Jfk Airport",
    "Singapore Changi Airport",
    "Leeds Bradford Airport",
    "Toronto Pearson Airport",
    "Amsterdam Schiphol Airport",
    "Hong Kong Airport",
    "Klia Kuala Lumpur Airport",
    "Rome Fiumicino Airport",
    "Bristol Airport"
]

airport_summaries = []

for airport in top_airports:
    reviews = get_all_reviews(airport_df, airport)
    if not reviews:
        print(f"Nema recenzija za {airport}")
        continue

    # Sažetak na engleskom
    summary_en = summarize_airport(reviews)

    # Prevod na srpski
    summary_sr = translate_to_serbian(summary_en)

    airport_summaries.append({
        "airport_name": airport,
        "summary_sentence": summary_sr
    })

    print("-" * 80)
    print(f"Sumarni opis za {airport}: {summary_sr}")

# Sačuvaj u DataFrame
summary_df = pd.DataFrame(airport_summaries)




--------------------------------------------------------------------------------
Sumarni opis za London Heathrow Airport: Dugo čekanje.
--------------------------------------------------------------------------------
Sumarni opis za London Stansted Airport: Nema razdvajanja tokova putnika.
--------------------------------------------------------------------------------
Sumarni opis za Manchester Airport: Nisam ljubitelj ovog aerodroma, datiran je i nema mnogo toga da se uradi.
--------------------------------------------------------------------------------
Sumarni opis za Paris Cdg Airport: Ne znam.
--------------------------------------------------------------------------------
Sumarni opis za Dubai Airport: Nisam ljubitelj ovog aerodroma, prevelik je, previše zagušen i pretežak za navigaciju.
--------------------------------------------------------------------------------
Sumarni opis za Luton Airport: Ulaznice za let kojim smo putovali.
----------------------------------------------

In [8]:
import psycopg2

try:
    conn = psycopg2.connect(
        host="localhost",
        database="airline_recommendations_db",
        user="postgres",
        password="postgres",
        port=5432
    )
    cursor = conn.cursor()

    # kreiranje tabele
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS airport_sentiment_summary_serbian (
            id SERIAL PRIMARY KEY,
            airport_name TEXT UNIQUE,
            summary_sentence TEXT
        );
    """)
    conn.commit()

    # upis podataka
    for _, r in summary_df.iterrows():
        cursor.execute("""
            INSERT INTO airport_sentiment_summary_serbian
            (airport_name, summary_sentence)
            VALUES (%s, %s)
        """, (
            r['airport_name'],
            r['summary_sentence']
        ))

    conn.commit()
    cursor.close()
    conn.close()

    print("Podaci o aerodromima su uspešno upisani u PostgreSQL bazu.")

except Exception as e:
    print("Greška pri upisu u bazu:", e)


Podaci o aerodromima su uspešno upisani u PostgreSQL bazu.
