### Importation et initialisation de findspark

In [5]:
import findspark
import os
# Point the JVM lookup at a Java 8 installation before initialising findspark.
# Adjust this path to match the JDK installed on your machine.
java8_location = '/usr/lib/jvm/java-8-openjdk-amd64'
os.environ.update(JAVA_HOME=java8_location)
findspark.init()

## Importation des modules requis

In [24]:
# Pyspark modules
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, StringType, StructType, StructField, IntegerType
from pyspark.sql import Row
from pyspark.sql.functions import lit
import gensim
import i18n
from googletrans import Translator
from gensim import models
from gensim.models import Word2Vec, KeyedVectors
from math import radians, cos, sin, asin, sqrt
import json

In [7]:
# Create (or reuse) the SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName("TransBigData")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Handle on the session's underlying SparkContext.
sc = spark.sparkContext

#### Préparation des points d'intérêt au Sénégal

In [8]:
# Column layout of the tab-separated POI file, in file order:
# (column name, Spark type class). Every column is declared nullable.
_poi_columns = [
    ("geoname_id", IntegerType),
    ("POI_name", StringType),
    ("POI_asciiname", StringType),
    ("POI_alternatenames", StringType),
    ("POI_latitude", FloatType),
    ("POI_longitude", FloatType),
    ("feature_class", StringType),
    ("feature_code", StringType),
    ("country_code", StringType),
    ("cc2", StringType),
    ("admin1_code", StringType),
    ("admin2_code", StringType),
    ("admin3_code", StringType),
    ("admin4_code", StringType),
    ("population", IntegerType),
    ("elevation", IntegerType),
    ("dem", IntegerType),
    ("timezone", StringType),
    ("Modification_date", StringType),
    ("AdminCodes", StringType),
]
schema2 = StructType(
    [StructField(_name, _dtype(), True) for _name, _dtype in _poi_columns]
)

# Columns kept for the rest of the notebook; everything else is dropped.
_kept_columns = (
    "geoname_id",
    "POI_name",
    "POI_asciiname",
    "POI_alternatenames",
    "POI_latitude",
    "POI_longitude",
)
_unused_columns = [
    _name for _name, _dtype in _poi_columns if _name not in _kept_columns
]

# Load the Senegal POI dataset and discard the columns we do not need.
df_POI_SN = (
    spark.read.csv("../SN.txt", sep="\t", header=True, schema=schema2)
    .drop(*_unused_columns)
)

In [9]:
# Preview the POI DataFrame after the unused columns have been dropped.
df_POI_SN.show()

+----------+--------------------+--------------------+--------------------+------------+-------------+
|geoname_id|            POI_name|       POI_asciiname|  POI_alternatenames|POI_latitude|POI_longitude|
+----------+--------------------+--------------------+--------------------+------------+-------------+
|   2243938|   Vallée du Loumbol|   Vallee du Loumbol|Vallee du Loumbel...|        15.4|    -13.93333|
|   2243939|          Ziguinchor|          Ziguinchor|Basse Casamance,D...|        12.8|    -16.36667|
|   2243940|          Ziguinchor|          Ziguinchor|ZIG,Zighinkor,Zig...|    12.56801|    -16.27326|
|   2243941|          Ziguinchor|          Ziguinchor|                null|    12.58333|    -16.26667|
|   2243942|  Ziguinchor Airport|  Ziguinchor Airport|GOGG,ZIG,Ziguinch...|    12.55569|    -16.28268|
|   2243943|          Ziguinchor|          Ziguinchor|Departement de Zi...|    12.52201|    -16.21621|
|   2243944|                Yoye|                Yoye|                nul

#### Preparation du dataset du SENEGAL

In [10]:
# Load the photo dataset (tab-separated, first line holds the column names).
df_SN = spark.read.csv("../fichiertest.tsv", sep="\t", header=True)

# Keep only photos that have a geotag and a description: a row is removed
# when its longitude, latitude or description is the empty string.
df_filtre = df_SN
for _required in ("longitude", "latitude", "description"):
    df_filtre = df_filtre.filter(~df_SN[_required].isin(""))

In [22]:
# Use the first filtered photo as the sample item. first() fetches a single
# row instead of collecting the whole DataFrame onto the driver.
item = df_filtre.first()
if item is None:
    # first() returns None on an empty DataFrame; fail with a clear message.
    raise ValueError("df_filtre is empty: no photo passed the filters")
str(item['identifier'])

'3170611706'

#### Définition de la fonction haversine

In [14]:
# Fonction permettant de calculer la distance entre deux points geographiques en 
# Fonction de leurs coordonnees
#def haversine(lon1, lat1, lon2, lat2):
#    """
#    Calculate the great circle distance between two points 
#    on the earth (specified in decimal degrees)
#    """
#    # convert decimal degrees to radians 
#    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
#    # haversine formula 
#    dlon = lon2 - lon1 
#    dlat = lat2 - lat1 
#    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
#    c = 2 * asin(sqrt(a)) 
#    # Radius of earth in kilometers is 6371
#    km = 6371* c
#    return km

In [None]:
def queryPOI(longitude, latitude, POI_longitude, POI_latitude, POI_name=None, radius=50):
    """Test whether a POI falls inside a bounding box centred on a point.

    The box extends ``radius`` metres north, south, east and west of
    (``longitude``, ``latitude``). Boxes crossing the antimeridian or a pole
    are not handled specially.

    Parameters
    ----------
    longitude, latitude : float or str
        Centre of the box, in decimal degrees.
    POI_longitude, POI_latitude : float or str
        Position of the candidate POI, in decimal degrees.
    POI_name : object, optional
        Value placed in the returned list when the POI is inside the box.
    radius : float, optional
        Half-width of the box in metres (default 50).

    Returns
    -------
    list
        ``[POI_name]`` if the POI lies inside the box, otherwise ``[]``.
    """
    # Local import: the notebook imports individual names from math, not the module.
    import math

    longitude = float(longitude)
    latitude = float(latitude)
    POI_longitude = float(POI_longitude)
    POI_latitude = float(POI_latitude)

    R = 6371000  # Earth radius in metres

    # An arc of `radius` metres on a great circle subtends radius / R radians.
    dY = math.degrees(radius / R)

    # Meridians converge toward the poles, so the same ground distance covers
    # more degrees of longitude at higher latitudes: divide by cos(latitude).
    cos_lat = math.cos(math.radians(latitude))
    # At the poles cos(latitude) is ~0; fall back to covering every longitude.
    dX = dY / cos_lat if cos_lat > 1e-12 else 180.0

    left = longitude - dX
    right = longitude + dX
    bottom = latitude - dY
    top = latitude + dY

    if left <= POI_longitude <= right and bottom <= POI_latitude <= top:
        return [POI_name]
    return []

#### Definition fonction Voisinage 

In [15]:
# Find the POIs of a DataFrame that lie close to a given photo.
def NeighboursItem(df, item, radius_km=1.5):
    """Return the POI rows of ``df`` located within ``radius_km`` of ``item``.

    Parameters
    ----------
    df : DataFrame
        POI table exposing ``POI_latitude`` and ``POI_longitude`` columns.
    item : Row or mapping
        One photo record. Its coordinates are read from ``'Latitude'`` /
        ``'Longitude'``, falling back to the lowercase field names.
    radius_km : float, optional
        Maximum haversine distance in kilometres (default 1.5). The
        comparison is strict (``<``).

    Returns
    -------
    list
        The matching rows of ``df``. Each match is also printed.

    Notes
    -----
    Relies on a ``haversine(lon1, lat1, lon2, lat2)`` function being defined
    in the notebook namespace before this function is called.
    """
    def _field(name):
        # Field lookup is case-sensitive; try the capitalised name first,
        # then the lowercase one used by the other cells.
        for key in (name, name.lower()):
            try:
                return item[key]
            except (KeyError, ValueError):
                continue
        raise KeyError(name)

    item_lat = float(_field('Latitude'))
    item_lon = float(_field('Longitude'))

    neigh = []
    # Collect once: calling collect() inside the loop re-runs the Spark job
    # for every single row access.
    for row in df.collect():
        lat = row['POI_latitude']
        lon = row['POI_longitude']
        if lat is None or lon is None:
            # A POI without coordinates cannot be located; skip it.
            continue
        if haversine(item_lon, item_lat, float(lon), float(lat)) < radius_km:
            neigh.append(row)
            print(row)
    return neigh

In [None]:
data = NeighboursItem(df_POI_SN, item)

# Serialise before touching the disk so a serialisation error cannot leave
# a partially written file behind.
payload = json.dumps(data, ensure_ascii=False, indent=4)
output_path = '../data/data' + str(item['identifier']) + '.json'
try:
    # Mode 'x' creates the file and raises FileExistsError if it is already
    # there. Append mode would concatenate a second JSON document onto the
    # first, producing a file that is no longer valid JSON.
    with open(output_path, 'x', encoding='utf-8') as f:
        f.write(payload)
except FileExistsError:
    print("file already existe !!!!")

In [None]:
# Fonction permettant de calculer la distance entre deux points geographiques en 
# Fonction de leurs coordonnees
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance, in kilometres, between two points on the Earth.

    Both points are given as (longitude, latitude) in decimal degrees.
    The Earth is modelled as a sphere of radius 6371 km.
    """
    # Work in radians throughout.
    lam1 = radians(lon1)
    phi1 = radians(lat1)
    lam2 = radians(lon2)
    phi2 = radians(lat2)

    # Haversine formula.
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    a = sin(delta_phi/2)**2 + cos(phi1) * cos(phi2) * sin(delta_lam/2)**2
    central_angle = 2 * asin(sqrt(a))

    # Scale the central angle by the Earth radius (6371 km).
    return 6371 * central_angle

# Fonction pour traduire les user_tags en Francais pour les 
# Points preselectionnees
from googletrans import Translator
def translate(list_words, dest):
    """Translate each word of ``list_words``, caching new translations on disk.

    For every word the ``En.<word>`` key is looked up with ``i18n.t``. When
    the lookup returns the key itself (no stored translation), the word is
    translated with the module-level ``translator`` object and the result is
    appended to ``translations/En.en.yml``.

    Parameters
    ----------
    list_words : iterable of str
        Words to translate.
    dest : str
        Destination language passed to ``translator.translate``.

    Returns
    -------
    list of str
        The translations, in the same order as ``list_words``.

    Notes
    -----
    ``translator`` is not created in this function; it must exist in the
    notebook namespace -- presumably a ``googletrans.Translator()`` instance,
    confirm against the cell that defines it.
    """
    trans_words = []
    for word in list_words:
        key = 'En.{}'.format(word)
        cached = i18n.t(key)
        # A missing key is detected by getting the key back unchanged.
        # Testing only for an 'En' prefix would also reject genuine
        # translations that start with those letters.
        if cached != key:
            trans_words.append(cached)
            continue

        word_trans = translator.translate(word, dest='{}'.format(dest)).text
        try:
            # The context manager closes the file even if the write fails;
            # UTF-8 keeps accented characters intact.
            with open('translations/En.en.yml', 'a', encoding='utf-8') as fichier:
                fichier.write('\n  {}: {}'.format(word, word_trans))
        except OSError:
            # Caching is best effort: report the failure and keep the translation.
            print("Some Error occur")
        trans_words.append(word_trans)
    return trans_words
            