In [1]:
import pandas as pd

# Load the airport review dataset; the relative path is resolved against the
# directory the kernel was started in.
airport_df = pd.read_csv("airport.csv")

In [2]:
# Print column names, non-null counts and dtypes to see which fields are
# sparsely populated before relying on them.
airport_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17721 entries, 0 to 17720
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   airport_name                 17721 non-null  object 
 1   link                         17721 non-null  object 
 2   title                        17721 non-null  object 
 3   author                       17721 non-null  object 
 4   author_country               12777 non-null  object 
 5   date                         17721 non-null  object 
 6   content                      17721 non-null  object 
 7   experience_airport           647 non-null    object 
 8   date_visit                   593 non-null    object 
 9   type_traveller               646 non-null    object 
 10  overall_rating               13796 non-null  float64
 11  queuing_rating               12813 non-null  float64
 12  terminal_cleanliness_rating  12815 non-null  float64
 13  terminal_seating

#### Analiziranje recenzija aerodroma. Računa se dužina svake recenzije i prikazuju se prosečna, minimalna i maksimalna vrednost i percentili, kako bi se izabrao odgovarajući pristup prilikom sentiment analize. Imena aerodroma se formatiraju da budu čitljiva. Zatim se broji koliko recenzija ima svaki aerodrom i sortira po najvećem broju. Prikazuje se top 20 aerodroma po broju recenzija, a za njih se računa prosečna ocena i sortira od najbolje ka najlošije ocenjenom, kako bi se rezultati mogli kasnije uporediti sa generisanim opisima i proceniti njihova relevantnost.

In [3]:
# Whitespace-token count of every review, computed once and reused by all the
# statistics below (the same astype/apply pass over the whole column used to be
# repeated for every printed line).
review_token_counts = airport_df['content'].astype(str).apply(lambda text: len(text.split()))

print("Analiza dužine recenzija (broj tokena):")
print(f"Prosečan broj tokena: {review_token_counts.mean():.2f}")
print(f"25. percentil: {review_token_counts.quantile(0.25):.0f}")
print(f"50. percentil (medijana): {review_token_counts.quantile(0.50):.0f}")
print(f"75. percentil: {review_token_counts.quantile(0.75):.0f}")
print(f"Minimalan broj tokena: {review_token_counts.min()}")
print(f"Maksimalan broj tokena: {review_token_counts.max()}")


Analiza dužine recenzija (broj tokena):
Prosečan broj tokena: 115.57
25. percentil: 64
50. percentil (medijana): 98
75. percentil: 146
Minimalan broj tokena: 9
Maksimalan broj tokena: 933


In [4]:
# Make airport names readable: hyphens become spaces and each word is
# title-cased. The column is overwritten in place on the loaded frame.
# Note that title-casing also lowercases the tail of acronyms (the later
# output shows "Paris Cdg Airport").
airport_df['airport_name'] = airport_df['airport_name'].str.replace('-', ' ', regex=False).str.title()


In [5]:
# Number of non-null review texts per airport, largest first.
airport_counts = (
    airport_df
    .groupby('airport_name')['content']
    .count()
    .reset_index(name='num_reviews')
    .sort_values(by='num_reviews', ascending=False)
)
print(airport_counts)


                airport_name  num_reviews
410  London Heathrow Airport          520
411  London Stansted Airport          402
437       Manchester Airport          303
519        Paris Cdg Airport          301
210            Dubai Airport          279
..                       ...          ...
709         Wakkanai Airport            1
719  Westerland Sylt Airport            1
720          Wichita Airport            1
721             Wick Airport            1
1             Aarhus Airport            1

[741 rows x 2 columns]


In [6]:
# Keep the 20 airports with the most reviews (airport_counts is already
# sorted by num_reviews in descending order).
top_airports = airport_counts.iloc[:20]
print(top_airports)


                     airport_name  num_reviews
410       London Heathrow Airport          520
411       London Stansted Airport          402
437            Manchester Airport          303
519             Paris Cdg Airport          301
210                 Dubai Airport          279
420                 Luton Airport          275
409        London Gatwick Airport          252
60   Bangkok Suvarnabhumi Airport          220
242        Frankfurt Main Airport          218
414       Los Angeles Lax Airport          199
455                 Miami Airport          191
488          New York Jfk Airport          185
624      Singapore Changi Airport          181
392        Leeds Bradford Airport          166
675       Toronto Pearson Airport          166
31     Amsterdam Schiphol Airport          166
304             Hong Kong Airport          162
366     Klia Kuala Lumpur Airport          160
581        Rome Fiumicino Airport          155
116               Bristol Airport          154


In [7]:
# Names of the most-reviewed airports, in descending review-count order.
top_airport_names = top_airports['airport_name'].tolist()

# Restrict the review table to those airports.
top_airport_df = airport_df.loc[lambda frame: frame['airport_name'].isin(top_airport_names)]

# Mean overall rating per airport (missing ratings are skipped by mean()),
# rounded to two decimals and ordered from best to worst.
airport_sentiment_summary = (
    top_airport_df
    .groupby('airport_name')['overall_rating']
    .mean()
    .round(2)
    .reset_index()
    .sort_values(by='overall_rating', ascending=False)
)

print(airport_sentiment_summary)


                    airport_name  overall_rating
18      Singapore Changi Airport            7.09
5              Hong Kong Airport            6.20
6      Klia Kuala Lumpur Airport            5.56
0     Amsterdam Schiphol Airport            5.55
9        London Heathrow Airport            4.85
8         London Gatwick Airport            4.53
1   Bangkok Suvarnabhumi Airport            4.27
13            Manchester Airport            4.08
19       Toronto Pearson Airport            4.06
3                  Dubai Airport            3.78
4         Frankfurt Main Airport            3.72
15          New York Jfk Airport            3.59
17        Rome Fiumicino Airport            3.50
2                Bristol Airport            3.44
11       Los Angeles Lax Airport            3.32
16             Paris Cdg Airport            3.24
7         Leeds Bradford Airport            3.21
14                 Miami Airport            3.03
12                 Luton Airport            2.89
10       London Stan

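#### Kreiranje sažetih opisa aerodroma na srpskom jeziku. Za top 20 aerodroma analiziraju se sve recenzije, generiše se kratka rečenica na engleskom pomoću FLAN-T5 modela, a zatim se prevodi na srpski korišćenjem Helsinki-NLP modela. Napomena: ulaz u FLAN-T5 se skraćuje na 512 tokena, pa sažetak efektivno pokriva samo početak spojenog teksta recenzija.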

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import pandas as pd

# Checkpoint used by translate_to_serbian; the tokenizer and the seq2seq model
# are both loaded from the same Hugging Face identifier.
# NOTE(review): the "en-sh" suffix presumably means English -> Serbo-Croatian — confirm
# that this is the intended target variant.
translator_name = "Helsinki-NLP/opus-mt-tc-base-en-sh"
translator_tokenizer = AutoTokenizer.from_pretrained(translator_name)
translator_model = AutoModelForSeq2SeqLM.from_pretrained(translator_name)

def translate_to_serbian(text):
    """Translate ``text`` with the module-level translator and return the decoded string.

    The text is tokenized with truncation enabled, passed to
    ``translator_model.generate`` with at most 200 new tokens, and only the
    first generated sequence is decoded (special tokens stripped).
    """
    encoded = translator_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    generated = translator_model.generate(max_new_tokens=200, **encoded)
    first_sequence = generated[0]
    return translator_tokenizer.decode(first_sequence, skip_special_tokens=True)

def get_all_reviews(df, airport_name):
    """Return the non-null ``content`` values for one airport as a list.

    Rows are selected by exact equality between the ``airport_name`` column
    and the ``airport_name`` argument; an unknown airport yields an empty list.
    """
    matches_airport = df['airport_name'] == airport_name
    review_texts = df.loc[matches_airport, 'content']
    return review_texts.dropna().tolist()


# Summarisation checkpoint. The identifier gets its own name (mirroring
# `translator_name`) so that `summ_model` only ever refers to the loaded model
# and is not briefly bound to a string first.
summ_model_name = "google/flan-t5-large"
summ_tokenizer = AutoTokenizer.from_pretrained(summ_model_name)
summ_model = AutoModelForSeq2SeqLM.from_pretrained(summ_model_name)

def summarize_airport(reviews, max_input_tokens=512, max_new_tokens=80, num_beams=5):
    """Generate a one-sentence English summary for a list of review texts.

    The reviews are joined with single spaces, prefixed with the summarisation
    instruction and passed to the module-level ``summ_tokenizer`` /
    ``summ_model``.

    Args:
        reviews: Iterable of review strings for one airport.
        max_input_tokens: Maximum tokenized prompt length; longer prompts are
            truncated by the tokenizer. Defaults to 512.
        max_new_tokens: Upper bound on generated tokens. Defaults to 80.
        num_beams: Beam width used for generation. Defaults to 5.

    Returns:
        The first generated sequence decoded to text, special tokens removed.

    NOTE(review): because the prompt is truncated to ``max_input_tokens``, only
    the beginning of the joined text reaches the model. For airports with many
    reviews the summary therefore reflects the first few reviews, not all of
    them.
    """
    combined_text = " ".join(reviews)
    input_text = f"Summarize the following airport reviews into one concise sentence.: {combined_text}"

    inputs = summ_tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    outputs = summ_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        early_stopping=True,
    )
    return summ_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Build one translated summary sentence per top airport. Airports without any
# review text are reported and skipped, so they do not appear in summary_df.
airport_summaries = []

for airport in top_airport_names:
    reviews = get_all_reviews(airport_df, airport)

    if reviews:
        # English summary first, then translation of that single sentence.
        summary_en = summarize_airport(reviews)
        summary_sr = translate_to_serbian(summary_en)

        airport_summaries.append({"airport_name": airport, "summary_sentence": summary_sr})

        print("\n")
        print(f"Sumarni opis za {airport}: {summary_sr}")
    else:
        print(f"Nema recenzija za {airport}")

# One row per summarised airport: airport_name, summary_sentence.
summary_df = pd.DataFrame(airport_summaries)






Sumarni opis za London Heathrow Airport: Bilo je malo usko grlo u šetnji od kapije pa sve do vozova koji su odvodili ljude sa udaljenih satelitskih terminala do 'glavnog' dijela T5.


Sumarni opis za London Stansted Airport: To je noćna mora.


Sumarni opis za Manchester Airport: Nije najbolji aerodrom na svijetu, ali nije ni najgori.


Sumarni opis za Paris Cdg Airport: Nikada se neću vratiti na ovaj aerodrom.


Sumarni opis za Dubai Airport: Jednostavno je prevelika, teško zagušena i teška za navigaciju.


Sumarni opis za Luton Airport: Aerodrom nudi letove s niskim troškovima koji nemaju infrastrukturu.


Sumarni opis za London Gatwick Airport: Gatwick North Terminal je reorganizirao svoje sigurnosne usluge


Sumarni opis za Bangkok Suvarnabhumi Airport: To je bolji aerodrom nego što je nekad bio.


Sumarni opis za Frankfurt Main Airport: Osoblje bez poštovanja sigurnosti


Sumarni opis za Los Angeles Lax Airport: Najgori aerodrom u kojem sam bio na svojim putovanjima.


Sumarni o

#### Upis rezultata u bazu podataka

In [10]:
import os

import psycopg2

# Persist the generated summaries to PostgreSQL.
#
# Connection settings are read from the standard libpq environment variables;
# the fallbacks are the values this notebook previously hard-coded, so an
# unconfigured local setup behaves as before.
# NOTE(review): the fallback password is a local-development default — set
# PGPASSWORD (and friends) for any shared database.
try:
    conn = psycopg2.connect(
        host=os.environ.get("PGHOST", "localhost"),
        database=os.environ.get("PGDATABASE", "airline_recommendations_db"),
        user=os.environ.get("PGUSER", "postgres"),
        password=os.environ.get("PGPASSWORD", "postgres"),
        port=int(os.environ.get("PGPORT", 5432)),
    )
    try:
        # `with conn` commits on success and rolls back on error; it does not
        # close the connection, which is why the outer `finally` does so.
        with conn, conn.cursor() as cursor:
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS airport_sentiment_summary_serbian (
                    id SERIAL PRIMARY KEY,
                    airport_name TEXT UNIQUE,
                    summary_sentence TEXT
                );
            """)

        rows = [
            (r['airport_name'], r['summary_sentence'])
            for _, r in summary_df.iterrows()
        ]

        # Upsert so that re-running the cell refreshes existing rows instead of
        # failing on the UNIQUE(airport_name) constraint.
        with conn, conn.cursor() as cursor:
            cursor.executemany("""
                INSERT INTO airport_sentiment_summary_serbian
                (airport_name, summary_sentence)
                VALUES (%s, %s)
                ON CONFLICT (airport_name)
                DO UPDATE SET summary_sentence = EXCLUDED.summary_sentence
            """, rows)
    finally:
        # Always release the connection, including when a statement failed.
        conn.close()

    print("Podaci o aerodromima su uspešno upisani u PostgreSQL bazu.")

except Exception as e:
    # Best-effort cell: report the failure instead of stopping the notebook.
    print("Greška pri upisu u bazu:", e)


Podaci o aerodromima su uspešno upisani u PostgreSQL bazu.
