In [7]:
import pandas as pd

# Load the three pre-filtered review datasets (airline, lounge and seat
# reviews) from CSV files located next to the notebook.
airline_df, lounge_df, seat_df = map(
    pd.read_csv,
    (
        "airline_top20_df_filtered.csv",
        "lounge_top20_df_filtered.csv",
        "seat_top20_df_filtered.csv",
    ),
)

#### Kombinovanje mašinskog učenja i analize sentimenta kako bi se rangirale avio-kompanije. Najpre se trenira Gradient Boosting model na numeričkim karakteristikama kako bi se odredile njihove težine (značaj). Zatim se primenjuje analiza sentimenta na tekst recenzija: kraći tekstovi se obrađuju pomoću VADER-a, a duži pomoću modela zasnovanog na BERT-u. Rezultati ML modela i sentimenta se kombinuju, a ocena se dodatno prilagođava na osnovu preporuke korisnika (atribut recommended). Na kraju se računa prosečna ocena po avio-kompaniji i vraćaju se najbolje rangirane avio-kompanije zajedno sa težinama karakteristika.

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import pipeline


# Scorer used for long reviews: a Hugging Face sentiment-analysis pipeline
# backed by a multilingual BERT model, created with truncation enabled.
bert_sentiment = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    truncation=True,
)

# Scorer used for short reviews: NLTK's VADER intensity analyzer.
sia = SentimentIntensityAnalyzer()

def compute_top_airlines(df, feature_columns, dataset_name="Dataset", text_column='content', recommended_column='recommended', top_n=20, token_threshold=250):
    """Rank airlines by a blend of a feature-weighted score and review sentiment.

    Steps:
      1. Fit a GradientBoostingRegressor that predicts ``overall_rating`` from
         ``feature_columns`` and use its feature importances as weights.
      2. Compute a weighted sum of the feature columns per review and min-max
         scale it to the range [1, 10].
      3. Score the review text on the same [1, 10] scale: VADER for texts with
         at most ``token_threshold`` whitespace-separated words, the BERT
         pipeline otherwise.
      4. Blend both scores (70% ML, 30% sentiment), scale by 1.05 / 0.95 when
         the reviewer answered 'yes' / 'no' in ``recommended_column`` and clamp
         the result to [1, 10].
      5. Average the adjusted score per ``airline_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews; must contain ``feature_columns``, ``overall_rating``,
        ``airline_name``, ``text_column`` and ``recommended_column``.
    feature_columns : list of str
        Numeric rating columns used as model inputs.
    dataset_name : str
        Label used only in the printed weights header.
    text_column : str
        Column holding the free-text review.
    recommended_column : str
        Column holding the 'yes' / 'no' recommendation flag.
    top_n : int
        Number of best-ranked airlines to return.
    token_threshold : int
        Maximum word count (whitespace split) still handled by VADER.

    Returns
    -------
    (pandas.Series, numpy.ndarray)
        Mean adjusted score per airline (rounded to 2 decimals, sorted
        descending, at most ``top_n`` rows) and the feature importances in the
        order of ``feature_columns``.
    """
    df_copy = df.copy()

    X = df_copy[feature_columns]
    y = df_copy['overall_rating']

    # Only the training part is used; the split is kept so the fitted
    # importances stay identical to earlier runs of this notebook.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)
    model = GradientBoostingRegressor(random_state=42)
    model.fit(X_train, y_train)

    weights = model.feature_importances_
    print(f"ML Težine ({dataset_name}):")
    for col, w in zip(feature_columns, weights):
        print(f"{col}: {w:.3f}")

    # Weighted sum of the features, computed for all rows at once.
    df_copy['ml_score'] = X.dot(weights)

    ml_min, ml_max = df_copy['ml_score'].min(), df_copy['ml_score'].max()
    ml_range = ml_max - ml_min
    if ml_range > 0:
        df_copy['ml_score_norm'] = 1 + 9 * (df_copy['ml_score'] - ml_min) / ml_range
    else:
        # All rows share one score: min-max scaling would divide by zero and
        # produce NaN, so fall back to the midpoint of the 1-10 scale.
        df_copy['ml_score_norm'] = 5.5

    def compute_sentiment(text):
        """Map one review text to a sentiment score in [1, 10]."""
        if not isinstance(text, str):
            # Missing review text (e.g. NaN read from CSV) is treated as neutral.
            return 5.5
        if len(text.split()) <= token_threshold:
            # VADER compound is in [-1, 1]; rescale linearly to [1, 10].
            return 1 + 9 * ((sia.polarity_scores(text)['compound'] + 1) / 2)
        # NOTE(review): this slice keeps 512 *characters*, not tokens. If the
        # intent was BERT's 512-token limit, the pipeline is already created
        # with truncation=True -- confirm before changing, as scores would shift.
        bert_result = bert_sentiment(text[:512])[0]
        # The label is parsed as "<n> star(s)"; map 1..5 stars onto 1..10.
        stars = int(bert_result['label'].split()[0])
        return (stars - 1) * 9 / 4 + 1

    df_copy['sentiment_score'] = df_copy[text_column].apply(compute_sentiment)

    df_copy['final_score'] = 0.7 * df_copy['ml_score_norm'] + 0.3 * df_copy['sentiment_score']

    # +5% for recommended, -5% for not recommended, unchanged for anything else.
    multiplier = df_copy[recommended_column].map({'yes': 1.05, 'no': 0.95}).fillna(1.0)
    df_copy['final_score_adjusted'] = (df_copy['final_score'] * multiplier).clip(lower=1, upper=10)

    top_airlines = (
        df_copy.groupby('airline_name')['final_score_adjusted']
        .mean()
        .round(2)
        .sort_values(ascending=False)
        .head(top_n)
    )

    return top_airlines, weights


Device set to use cpu


In [9]:
# Rank airlines separately on each review dataset. Every call prints the
# learned feature weights and returns (ranking, weights) for later cells.

# Airline (flight) reviews
airline_columns = [
    'value_money_rating',
    'seat_comfort_rating',
    'cabin_staff_rating',
    'food_beverages_rating',
]
top_airlines, weights_airline = compute_top_airlines(
    airline_df,
    airline_columns,
    dataset_name="Airline Reviews",
)
print("\nTOP 20 AVIOKOMPANIJA (Airline Reviews):", top_airlines, "\n", sep="\n")

# Lounge reviews
lounge_columns = [
    'comfort_rating',
    'cleanliness_rating',
    'bar_beverages_rating',
    'catering_rating',
    'washrooms_rating',
    'wifi_connectivity_rating',
    'staff_service_rating',
]
top_airlines_by_lounges, weights_lounge = compute_top_airlines(
    lounge_df,
    lounge_columns,
    dataset_name="Lounge Reviews",
)
print("\nTOP 20 AVIOKOMPANIJA (Lounge Reviews):", top_airlines_by_lounges, "\n", sep="\n")

# Seat reviews
seat_columns = [
    'seat_legroom_rating',
    'seat_recline_rating',
    'seat_width_rating',
    'aisle_space_rating',
    'viewing_tv_rating',
]
top_airlines_by_seats, weights_seat = compute_top_airlines(
    seat_df,
    seat_columns,
    dataset_name="Seat Reviews",
)
print("\nTOP 20 AVIOKOMPANIJA (Seat Reviews):", top_airlines_by_seats, sep="\n")

ML Težine (Airline Reviews):
value_money_rating: 0.716
seat_comfort_rating: 0.060
cabin_staff_rating: 0.156
food_beverages_rating: 0.068

TOP 20 AVIOKOMPANIJA (Airline Reviews):
airline_name
Qatar Airways               8.37
Singapore Airlines          8.35
Lufthansa                   7.98
Thai Airways                7.89
Qantas Airways              7.86
Cathay Pacific Airways      7.79
Malaysia Airlines           7.72
Klm Royal Dutch Airlines    7.71
Emirates                    7.38
Turkish Airlines            7.25
British Airways             6.97
Virgin Atlantic Airways     6.69
Jet Airways                 6.64
Air France                  6.59
Delta Air Lines             6.56
Etihad Airways              6.29
Air Canada                  5.87
American Airlines           5.23
United Airlines             5.14
Us Airways                  4.88
Name: final_score_adjusted, dtype: float64


ML Težine (Lounge Reviews):
comfort_rating: 0.190
cleanliness_rating: 0.143
bar_beverages_rating: 0.184


#### Pravljenje tekstualnog opisa najvažnijih faktora za ocene. Atributi se sortiraju po težini i pretvaraju se u nazive na srpskom. Zatim se kreira rečenica koja ističe koje dve stvari najviše utiču, a koje manje.

In [12]:

def generate_description_universal(columns, weights, col_map=None):
    """Build a sentence (in Serbian) describing which aspects matter most.

    Aspects are ordered by descending weight; the two heaviest are named as
    the main drivers and the rest as having a smaller influence. With only one
    or two aspects the "smaller influence" clause is omitted.

    Parameters
    ----------
    columns : iterable of str
        Aspect (column) names.
    weights : iterable of float
        One weight per entry of ``columns``, in the same order.
    col_map : dict, optional
        Maps column names to display labels; unmapped names are used as-is.

    Returns
    -------
    str
        The generated sentence.

    Raises
    ------
    ValueError
        If ``columns`` is empty or its length differs from ``weights``.
    """
    columns = list(columns)
    weights = list(weights)

    if len(columns) != len(weights):
        raise ValueError(
            f"columns and weights must have the same length "
            f"(got {len(columns)} and {len(weights)})"
        )
    if not columns:
        raise ValueError("at least one column is required to build a description")

    if col_map is None:
        col_map = {}

    # Heaviest aspect first; sorted() is stable, so ties keep the input order.
    aspects_weights = sorted(zip(columns, weights), key=lambda x: x[1], reverse=True)
    parts = [col_map.get(col, col) for col, _ in aspects_weights]

    if len(parts) == 1:
        return f"Ukupne ocene najviše zavise od {parts[0]}."
    if len(parts) == 2:
        # No remaining aspects, so skip the "smaller influence" clause.
        return f"Ukupne ocene najviše zavise od {parts[0]} i {parts[1]}."

    return f"Ukupne ocene najviše zavise od {parts[0]} i {parts[1]}, dok {', '.join(parts[2:])} imaju manji uticaj."


# Display labels (Serbian) for the rating columns of each dataset.
airline_col_map = {
    "value_money_rating": "odnos cene i kvaliteta",
    "seat_comfort_rating": "komfor sedišta",
    "cabin_staff_rating": "osoblje u kabini",
    "food_beverages_rating": "hrana i piće",
}
lounge_col_map = {
    "comfort_rating": "komfor",
    "cleanliness_rating": "čistoća",
    "bar_beverages_rating": "piće u baru",
    "catering_rating": "hranjenje",
    "washrooms_rating": "toaleti",
    "wifi_connectivity_rating": "wifi konekcija",
    "staff_service_rating": "osoblje",
}
seat_col_map = {
    "seat_legroom_rating": "prostor za noge",
    "seat_recline_rating": "naslon sedišta",
    "seat_width_rating": "širina sedišta",
    "aisle_space_rating": "prostor uz prolaz",
    "viewing_tv_rating": "TV ekran",
}

# Build one description sentence per dataset and print it under its heading.
description_airline = generate_description_universal(
    airline_columns, weights_airline, airline_col_map
)
print("\nAirline opis važnosti aspekata:", description_airline, sep="\n")

description_lounge = generate_description_universal(
    lounge_columns, weights_lounge, lounge_col_map
)
print("\nLounge opis važnosti aspekata:", description_lounge, sep="\n")

description_seat = generate_description_universal(
    seat_columns, weights_seat, seat_col_map
)
print("\nSeat opis važnosti aspekata:", description_seat, sep="\n")

# Collect the sentences in a frame (one row per dataset) for the database step.
description_df = pd.DataFrame(
    {
        "dataset": ["airline", "lounge", "seat"],
        "description": [description_airline, description_lounge, description_seat],
    }
)


Airline opis važnosti aspekata:
Ukupne ocene najviše zavise od odnos cene i kvaliteta i osoblje u kabini, dok hrana i piće, komfor sedišta imaju manji uticaj.

Lounge opis važnosti aspekata:
Ukupne ocene najviše zavise od osoblje i komfor, dok piće u baru, hranjenje, čistoća, toaleti, wifi konekcija imaju manji uticaj.

Seat opis važnosti aspekata:
Ukupne ocene najviše zavise od prostor za noge i širina sedišta, dok naslon sedišta, prostor uz prolaz, TV ekran imaju manji uticaj.


#### Upis podataka u bazu

In [16]:
import os

import psycopg2
from psycopg2 import sql

# Connection settings are taken from the standard libpq environment variables
# when present, so credentials need not live in the notebook; the fallbacks
# are the previous local-development values.
DB_HOST = os.environ.get("PGHOST", "localhost")
DB_NAME = os.environ.get("PGDATABASE", "airline_recommendations_db")
DB_USER = os.environ.get("PGUSER", "postgres")
DB_PASSWORD = os.environ.get("PGPASSWORD", "postgres")
DB_PORT = int(os.environ.get("PGPORT", "5432"))

# Target table -> ranking Series (index: airline name, value: score).
rankings_by_table = {
    "airline_ratings_simple": top_airlines,
    "lounge_ratings_simple": top_airlines_by_lounges,
    "seat_ratings_simple": top_airlines_by_seats,
}

# Bind both names up front so the finally block is safe even when connect()
# raises (otherwise a NameError would hide the real error on a fresh kernel).
conn = None
cursor = None

try:
    conn = psycopg2.connect(
        host=DB_HOST,
        database=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD,
        port=DB_PORT
    )
    cursor = conn.cursor()

    # Replace the previous contents: empty all tables first, then refill them.
    # Everything runs in one transaction and is committed only at the end.
    for table in rankings_by_table:
        cursor.execute(sql.SQL("TRUNCATE TABLE {};").format(sql.Identifier(table)))

    for table, ranking in rankings_by_table.items():
        insert_query = sql.SQL(
            "INSERT INTO {} (airline_name, overall_rating) VALUES (%s, %s);"
        ).format(sql.Identifier(table))
        # float() converts numpy scalars to plain Python floats for the driver.
        rows = [(airline, float(score)) for airline, score in ranking.items()]
        cursor.executemany(insert_query, rows)

    conn.commit()
    print("\nPodaci uspešno upisani u PostgreSQL!")

except Exception as e:
    # Best-effort cell: report the failure and let the notebook continue.
    # Nothing was committed, so closing the connection discards partial work.
    print("Greška prilikom upisa u bazu:", e)

finally:
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()



Podaci uspešno upisani u PostgreSQL!


In [17]:
import os

import psycopg2

# Connection settings are taken from the standard libpq environment variables
# when present, so credentials need not live in the notebook; the fallbacks
# are the previous local-development values.
DB_HOST = os.environ.get("PGHOST", "localhost")
DB_NAME = os.environ.get("PGDATABASE", "airline_recommendations_db")
DB_USER = os.environ.get("PGUSER", "postgres")
DB_PASSWORD = os.environ.get("PGPASSWORD", "postgres")
DB_PORT = int(os.environ.get("PGPORT", "5432"))

# Bind both names up front so the finally block is safe even when connect()
# raises (otherwise a NameError would hide the real error on a fresh kernel).
conn = None
cursor = None

try:
    conn = psycopg2.connect(
        host=DB_HOST,
        database=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD,
        port=DB_PORT
    )
    cursor = conn.cursor()

    # Replace the previous contents with the freshly generated descriptions;
    # truncate and inserts share one transaction, committed at the end.
    cursor.execute("TRUNCATE TABLE aspect_importance_airline;")

    description_rows = list(zip(description_df['dataset'], description_df['description']))
    cursor.executemany(
        "INSERT INTO aspect_importance_airline (dataset, description) VALUES (%s, %s);",
        description_rows
    )

    conn.commit()
    print("Opis važnosti aspekata uspešno upisan u bazu!")

except Exception as e:
    # Best-effort cell: report the failure and let the notebook continue.
    # Nothing was committed, so closing the connection discards partial work.
    print("Greška prilikom upisa u bazu:", e)

finally:
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()


Opis važnosti aspekata uspešno upisan u bazu!
